diff --git a/.gitignore b/.gitignore index cecd6fa91c754d0862d26a10833a83aa3ced819c..801790d0a472080af607e9fbcde0284902a4ead8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h +paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc +paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* @@ -49,6 +53,10 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td +paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td +tools/infrt/kernels.json +tools/infrt/kernel_signature.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b499fb43ab996b1c1780c0276faad2c37a8808a..6988434996bcc4745726b34278eb6007fdf8605f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module @@ -238,7 +239,8 @@ option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) option(WITH_STRIP "Strip so files of Whl packages" OFF) -option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_PYPI "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arches cubin package" OFF) option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) diff --git a/README.md b/README.md index 7dc83aa695cef8ecf177dfc2c444888850342bdc..cdbf2d9f3bf9973fb6c7fe2365ea61f05ce998c1 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 2.3 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. 
+PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. diff --git a/README_cn.md b/README_cn.md index 6b37cfd97b35729dd293452178646db8f1194ca3..3834ee148f940326a2b1e1a8d0fd63a1028b0c96 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,7 +15,7 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者265万,服务企业10万家,基于飞桨开源深度学习平台产生了34万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者406万,服务企业15.7万家,基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 34c079ba71cf8ff1789ef31b9abb71dc171edfe6..312a0305244684c88e8926d2a71db377b0dd6be1 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,16 +6,22 @@ if(WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") set(paddle_known_gpu_archs10 "53 62 72") -elseif(NEW_RELEASE_CUBIN) +elseif(NEW_RELEASE_ALL) + message("Using New Release Strategy - All Arches Packge") + add_definitions(-DNEW_RELEASE_ALL) + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") + set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80") +elseif(NEW_RELEASE_PYPI) message("Using New Release Strategy - Cubin Packge") - add_definitions(-DNEW_RELEASE_CUBIN) - set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86") - set(paddle_known_gpu_archs10 "50 60 70 75") - set(paddle_known_gpu_archs11 "60 70 75 80") + add_definitions(-DNEW_RELEASE_PYPI) + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs10 "") + set(paddle_known_gpu_archs11 "60 61 70 75 80") elseif(NEW_RELEASE_JIT) message("Using New Release Strategy - JIT Packge") add_definitions(-DNEW_RELEASE_JIT) - set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") set(paddle_known_gpu_archs10 "35 50 60 70 75") set(paddle_known_gpu_archs11 "35 50 60 70 75 80") else() @@ -148,7 +154,7 @@ function(select_nvcc_arch_flags out_variable) # remove dots and convert to lists string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX REPLACE "\\." 
"" cuda_arch_ptx "${cuda_arch_ptx}") string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd7314ac7026fccdf45fae2fa3de09d3..9f6fd32ad986c4a5911b1d00dfb548fa3320c34d 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -100,8 +100,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + add_public_tablegen_target(MLIR${td_base}IncGen) + add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() # Execute the mlir script with infrt-exec program. diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2162f87812d130f19262955798f28e2c2adc4bac --- /dev/null +++ b/cmake/external/onnxruntime.cmake @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if (NOT WITH_ONNXRUNTIME) + return() +endif () + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +add_definitions(-DPADDLE_WITH_ONNXRUNTIME) + +SET(ONNXRUNTIME_PROJECT "extern_onnxruntime") +SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime) +SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT}) +SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime) +SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE) +SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE) +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") + + +if (WIN32) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip") +elseif (APPLE) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz") +else () + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz") +endif() + + +INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers. +if (WIN32) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." 
FORCE) +elseif (APPLE) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +else () + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +endif () + +if (WIN32) + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} && + ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +else () + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +endif() + +ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) +ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT}) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake new file mode 100644 index 0000000000000000000000000000000000000000..661c3675c84b27a7ed8210fec0cfeaa2c858487c --- /dev/null +++ b/cmake/external/paddle2onnx.cmake @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_ONNXRUNTIME) + return() +endif() + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx") +SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) +SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." 
FORCE) +SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) +SET(PADDLE2ONNX_TAG cpp) +SET(LIBDIR "lib") +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") + +INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers. +if(WIN32) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE) + SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE) +elseif(APPLE) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +else() + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +endif(WIN32) + + +# The protoc path is required to compile onnx. +string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) +list(POP_BACK PROTOC_BIN_PATH) +list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) + + +set(PADDLE2ONNX_OPTIONAL_ARGS + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} + -DWITH_STATIC=OFF + -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} +) + +if (WITH_PYTHON) + set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY} + ) +endif () + + +ExternalProject_Add( + ${PADDLE2ONNX_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} + GIT_TAG ${PADDLE2ONNX_TAG} + DEPENDS protobuf + PREFIX ${PADDLE2ONNX_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB} +) + +ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index f7cb7716969f5ccaa97d1ad7964510376b86870a..58ff5f0d2b715d117018eb2ff3d5989c8beb0694 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + + if(WITH_ONNXRUNTIME) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG v3.18.0) + elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -if(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ONNXRUNTIME) + SET(PROTOBUF_VERSION 3.18.0) 
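Editor's aside (illustrative, not part of the patch): because `cmake/external/onnxruntime.cmake` above calls `add_definitions(-DPADDLE_WITH_ONNXRUNTIME)`, downstream C++ sources can guard their ONNX Runtime code paths behind that macro. A minimal sketch using the stock ONNX Runtime 1.10 C++ API; the helper name and model path are hypothetical, not taken from this diff.

```cpp
// Illustrative sketch only, not code from this diff. Assumes the headers and
// libraries installed by cmake/external/onnxruntime.cmake are on the include
// and link paths.
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "onnxruntime_cxx_api.h"

// Hypothetical helper: open a session for an ONNX model exported via Paddle2ONNX.
inline Ort::Session CreateOrtSession(const char* model_path) {
  static Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "paddle_inference_demo");
  Ort::SessionOptions options;
  options.SetIntraOpNumThreads(1);  // conservative default for the sketch
  return Ort::Session(env, model_path, options);  // char* overload (Linux/macOS)
}
#endif  // PADDLE_WITH_ONNXRUNTIME
```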
+elseif(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) SET(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 415c0fe9bef9eab89e670d8b3f6f7c330b316ed8..cfbe68eecbaca55c5a288aae2c985bbc33d37be2 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7c17bd7cfe7e099e0afeaf623724e12387aff44..ba59eae392c66354b419bbfd2688a14a26f2e388 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -580,8 +580,8 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators") - set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) + set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) endif() if (hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) @@ -651,6 +651,7 @@ function(hip_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH") endif() endfunction(hip_test) @@ -667,6 +668,7 @@ function(xpu_library TARGET_NAME) else() xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c48d31f7e4f90296ecc48acb56e619aae129106e..851bd81403a85e52fbbb3c4c8bf0da1df63c8848 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST) endif() endif() + if (WITH_ONNXRUNTIME) + set(dst_dir "${DST}/third_party/install/onnxruntime") + copy(${TARGET} + SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + + set(dst_dir "${DST}/third_party/install/paddle2onnx") + if(WIN32) + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) + else() + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib) + endif() + endif() + set(dst_dir "${DST}/third_party/install/gflags") copy(${TARGET} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7affd59de162d5956672e5abfbf9f4b287fb7a83..1291e60cfe4ce13ca9aeeb3f8bdf068af0d5832c 100644 --- a/cmake/operators.cmake +++ 
b/cmake/operators.cmake @@ -293,11 +293,11 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") - - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. @@ -478,7 +478,7 @@ function(op_library TARGET) if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d9132b84455e7309713b99f9e574bfceb83c7b6c..ebb686d8ad0f31917e64161d6f7d2ecd4644fadd 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpudnn\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -97,6 +99,7 @@ function(kernel_library TARGET) set(gpu_srcs) set(xpu_srcs) set(gpudnn_srcs) + set(kps_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -128,8 +131,11 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -137,6 +143,15 @@ function(kernel_library TARGET) list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) endif() endif() + if (WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() else() # TODO(chenweihang): impl compile by source later endif() @@ -150,6 +165,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) list(APPEND 
all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -159,11 +175,11 @@ function(kernel_library TARGET) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) endif() foreach(include_kernel ${include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - endif() + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -176,72 +192,93 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # kernel source file level + # level 1: base device kernel + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs + # level 2: device-independent kernel + # - common_srcs + # level 3: Kernel implemented by reusing device-independent kernel + # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) + + # 1. Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. 
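Editor's aside (illustrative, not part of the patch): with the new `kps` branch in `kernel_declare`, a kernel that ships `cpu/` and `kps/` sources gets declarations appended to the generated header in exactly the formats shown above. For a hypothetical kernel named `scale` the generated lines would read:

```cpp
// Sketch of generated declarations (the kernel name is only an example).
// Each line mirrors a file(APPEND ...) format string in kernel_declare.
PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(scale, KPS, ALL_LAYOUT);
```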
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. - elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. 
Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) @@ -249,7 +286,7 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) @@ -275,6 +312,9 @@ function(kernel_library TARGET) if (${gpudnn_srcs_len} GREATER 0) kernel_declare(${gpudnn_srcs}) endif() + if (${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() if (${selected_rows_srcs_len} GREATER 0) kernel_declare(${selected_rows_srcs}) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ac3eff04d5383ecdf6c771babcaf3e6811600ac3..7df095c6c2ec04e1a694ed2458787af285c96a9a 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE) list(APPEND third_party_deps extern_gtest) ENDIF() +if(WITH_ONNXRUNTIME) + include(external/onnxruntime) # download, build, 
install onnxruntime、paddle2onnx + include(external/paddle2onnx) + list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) +endif() + if(WITH_GPU) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 41652f8b6ed6f717ad8a571be8e7a16408b34504..f88c993d85e2fa6eda27b7e845ee27f08347fa83 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,5 +1,12 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +if (WITH_DISTRIBUTE) + cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) +endif() +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() +if(WITH_ASCEND_CL) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h new file mode 100644 index 0000000000000000000000000000000000000000..09789bd4d378630f548f931bcac00fda89ef33be --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class NPUEventManager { + public: + NPUEventManager() = default; + + ~NPUEventManager() { + if (is_created_) { + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventDestroy(event_); + } + } + + NPUEventManager(const NPUEventManager&) = delete; + NPUEventManager& operator=(const NPUEventManager&) = delete; + + NPUEventManager(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + NPUEventManager& operator=(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + aclrtEvent GetRawNPUEvent() const { return event_; } + + void Record(const paddle::platform::NPUDeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "NPUDeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventRecord(event_, ctx.stream()); + } + + bool Query() const { + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(event_, &status); + if (status == ACL_EVENT_STATUS_COMPLETE) { + return true; + } + return false; + } + + void Block(const paddle::platform::NPUDeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::NPUDeviceGuard guard(device_index_); + platform::NPUStreamWaitEvent(ctx.stream(), event_); + } + } + + private: + bool is_created_{false}; + aclrtEvent event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::NPUDeviceGuard guard(device_index); + platform::NPUEventCreate(&event_); + is_created_ = true; + } +}; + +class HCCLCommManager { + public: + explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {} + + HCCLCommManager() : HCCLCommManager(nullptr) {} + + ~HCCLCommManager() noexcept { + std::unique_lock lock(mutex_); + if (hccl_comm_) { + platform::dynload::HcclCommDestroy(hccl_comm_); + } + } + + static std::shared_ptr Create(int num_ranks, int rank, + HcclRootInfo* comm_id, + HcclComm hccl_comm) { + auto hccl_manager = std::make_shared(); + auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank, + &hccl_comm); + using __NPU_STATUS_TYPE__ = decltype(ret); + constexpr auto __success_type__ = + platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess; + if (UNLIKELY(ret != __success_type__)) { + VLOG(0) << "Error: create hccl_id error."; + exit(-1); + } + + 
hccl_manager->hccl_id_ = comm_id; + hccl_manager->rank_ = rank; + hccl_manager->hccl_comm_ = hccl_comm; + return hccl_manager; + } + + HcclRootInfo* GetHcclId() const { + std::unique_lock lock(mutex_); + return hccl_id_; + } + + HcclComm GetHcclComm() const { + std::unique_lock lock(mutex_); + return hccl_comm_; + } + + HCCLCommManager(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(HCCLCommManager&& other) = delete; + + HCCLCommManager(HCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(hccl_comm_, other.hccl_comm_); + } + + protected: + HcclComm hccl_comm_; + HcclRootInfo* hccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index dde8622d9007e1372739d0fedde4938f85eda323..e43d0e8c183c7005f31b66c4c29dfc95361485e4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -96,7 +96,54 @@ class ProcessGroup { std::vector& /* tensors */, const BroadcastOptions& = BroadcastOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support allreduce", GetBackendName())); + "ProcessGroup%s does not support broadcast", GetBackendName())); + } + + virtual std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support barrier", GetBackendName())); + } + + virtual std::shared_ptr Send( + std::vector& tensors /* tensors */, int dst_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send", GetBackendName())); + } + + virtual std::shared_ptr Recv( + std::vector& tensors /* tensors */, int src_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support receive", GetBackendName())); + } + + virtual std::shared_ptr AllGather( + std::vector& in_tensors /* tensors */, // NOLINT + std::vector& out_tensors /* tensors */) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllGather", GetBackendName())); + } + + virtual std::shared_ptr AllToAll( + std::vector& in /* tensors */, // NOLINT + std::vector& out /* tensors */) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllToAll", GetBackendName())); + } + + virtual std::shared_ptr Reduce( + std::vector& tensors /* tensors */, // NOLINT + const ReduceOptions& opts) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support Reduce", GetBackendName())); + } + + virtual std::shared_ptr Scatter( + std::vector& in_tensors /* tensors */, // NOLINT + std::vector& out_tensors /* tensors */, // NOLINT + const ScatterOptions&) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support Scatter", GetBackendName())); } protected: diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dc43af117825bf95407255e93e1e4600e8ddd9a --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
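Editor's aside (illustrative, not part of the patch): the `ProcessGroup.h` hunk above gives every newly added collective (`Barrier`, `Send`, `Recv`, `AllGather`, `AllToAll`, `Reduce`, `Scatter`) a default implementation that throws "ProcessGroup%s does not support ...", so a concrete backend only overrides what it actually supports. A minimal sketch of that pattern, assuming the usual `std::shared_ptr<ProcessGroup::Task>` return type that the flattened diff elides; the backend class here is hypothetical.

```cpp
#include <memory>
#include <string>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"

namespace paddle {
namespace distributed {

// Hypothetical backend: only Barrier is overridden. Any other collective,
// e.g. AllToAll, falls through to the base class and raises
// "ProcessGroupFAKE does not support AllToAll".
class FakeProcessGroup : public ProcessGroup {
 public:
  FakeProcessGroup(int rank, int size) : ProcessGroup(rank, size) {}

  const std::string GetBackendName() const override { return "FAKE"; }

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& opts) override {
    // A real backend would enqueue the barrier and return a live task handle.
    return nullptr;
  }
};

}  // namespace distributed
}  // namespace paddle
```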
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#ifdef _WIN32 +#define GENERATE_FUNC(type, func, ...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT64: \ + func(__VA_ARGS__); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } + +#define HOST_NAME_MAX 256 + +#else +#define GENERATE_FUNC(type, func, args...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(args); \ + break; \ + case experimental::DataType::INT32: \ + func(args); \ + break; \ + case experimental::DataType::INT64: \ + func(args); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } +#endif + +typedef void (*reduce_func)(void*, const void*, const void*, size_t); + +template +reduce_func get_function(const ReduceOp& r) { + switch (r) { + case ReduceOp::SUM: + return reduce_func(&::gloo::sum); + case ReduceOp::PRODUCT: + return reduce_func(&::gloo::product); + case ReduceOp::MIN: + return reduce_func(&::gloo::min); + case ReduceOp::MAX: + return reduce_func(&::gloo::max); + case ReduceOp::AVG: + VLOG(0) << "Error: Unsupported ReduceOp::AVG."; + exit(-1); + } + + VLOG(0) << "Error: Unknown ReduceOp."; + exit(-1); +} + +bool CheckTensorsInCPUPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kCPU; + }); +} + +template +T* get_data(const Tensor& tensor) { + auto raw_tensor = std::dynamic_pointer_cast(tensor.impl()); + return static_cast(raw_tensor->data()); +} + +template +std::vector get_multi_data(const std::vector& tensors) { + std::vector ret(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + ret[i] = get_data(tensors[i]); + } + return ret; +} + +template +void set_output(P& opts, const Tensor& tensor) { // NOLINT + opts.setOutput(get_data(tensor), tensor.numel()); +} + +template +void set_input(P& opts, const Tensor& tensor) { // NOLINT + opts.setInput(get_data(tensor), tensor.numel()); +} + +template +void set_outputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs(P& opts, const std::vector& tensors) { // NOLINT + 
opts.setInputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs_for_scatter(P& opts, // NOLINT + const std::vector& tensors, // NOLINT + int nranks) { + std::vector ret(nranks); + auto raw_tensor = + std::dynamic_pointer_cast(tensors[0].impl()); + T* raw_pointer = reinterpret_cast(raw_tensor->data()); + size_t offset = 0; + for (int i = 0; i < nranks; i++) { + ret[i] = raw_pointer + offset; + offset += tensors[0].numel() / nranks; + } + opts.setInputs(ret, tensors[0].numel() / nranks); +} + +ProcessGroupGloo::GlooTask::GlooTask(int rank, + const std::vector& inputs, + CommType comm_type) + : ProcessGroup::Task(rank, inputs, comm_type) { + PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, + platform::errors::Fatal( + "Only CPU place is supported for ProcessGroupGloo.")); +} + +ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, + int rank, int world_size, + const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(store) { + _context = std::make_shared(rank, world_size); + auto prefix_store = + ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + _context->connectFullMesh(prefix_store, options->device); +} + +class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { + public: + BroadcastGlooTask(const std::shared_ptr& context, + const std::vector& inputs, int rank, int root, + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), + _context(context), + _root(root), + _inputs(inputs), + _tag(tag) {} + + void Run() override { _do_broadcast(_inputs[0]); } + + private: + std::shared_ptr _context; + const int _root; + std::vector _inputs{}; + const uint32_t _tag; + + void _do_broadcast(const Tensor& tensor) { + gloo::BroadcastOptions opts(_context); + const auto& dtype = tensor.type(); + GENERATE_FUNC(dtype, set_output, opts, tensor); + opts.setRoot(_root); + opts.setTag(_tag); + gloo::broadcast(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Broadcast( + std::vector& inputs, const BroadcastOptions& opts) { + auto root = opts.source_rank; + std::unique_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_unique(context, inputs, rank_, root, tag); + task->Run(); + return task; +} + +class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllreduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, ReduceOp reduce_op, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), + _context(context), + _inputs(inputs), + _reduce_op(reduce_op), + _tag(tag) {} + + void Run() override { _do_allreduce(_inputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + uint32_t _tag; + + gloo::AllreduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::AllreduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_allreduce(std::vector& tensors) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::AllreduceOptions opts(_context); + GENERATE_FUNC(dtype, set_inputs, opts, tensors); + GENERATE_FUNC(dtype, set_outputs, opts, tensors); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + gloo::allreduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, const 
AllreduceOptions& opts) { + auto tag = next_tag(); + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context, inputs, + opts.reduce_op, tag); + task->Run(); + return task; +} + +class BarrierGlooTask : public ProcessGroupGloo::GlooTask { + public: + BarrierGlooTask(int rank, const std::shared_ptr& context) + : ProcessGroupGloo::GlooTask(rank, std::vector{}, + CommType::BARRIER), + _context(context) {} + + void Run() override { _do_barrier(); } + + private: + std::shared_ptr _context; + + void _do_barrier() { + gloo::BarrierOptions opts(_context); + gloo::barrier(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Barrier( + const BarrierOptions& opts) { + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context); + task->Run(); + return task; +} + +class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllgatherGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _tag(tag) {} + + void Run() override { _do_allgather(_inputs, _outputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + uint32_t _tag; + + void _do_allgather(std::vector& in, // NOLINT + std::vector& out) { // NOLINT + const auto& dtype = in[0].type(); + gloo::AllgatherOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, in[0]); + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setTag(_tag); + gloo::allgather(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, in_tensors, + out_tensors, tag); + task->Run(); + return task; +} + +class ReduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + ReduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& in, ReduceOp reduce_op, // NOLINT + int dst, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE), + _context(context), + _inputs(in), + _reduce_op(reduce_op), + _dst(dst), + _tag(tag) {} + + void Run() override { _do_reduce(_inputs, _dst); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + int _dst; + uint32_t _tag; + + gloo::ReduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::ReduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::ReduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_reduce(std::vector& tensors, int dst) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::ReduceOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, tensors[0]); + GENERATE_FUNC(dtype, set_output, opts, tensors[0]); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + opts.setRoot(dst); + gloo::reduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, tensors, + opts.reduce_op, opts.root_rank, tag); + task->Run(); + return task; +} + +class ScatterGlooTask : public 
ProcessGroupGloo::GlooTask { + public: + ScatterGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int src, int size, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _src(src), + _size(size), + _tag(tag) {} + + void Run() override { _do_scatter(_inputs, _outputs, _src); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + int _src; + int _size; + uint32_t _tag; + + void _do_scatter(std::vector& in, std::vector& out, // NOLINT + int src) { + const auto& dtype = in[0].type(); + gloo::ScatterOptions opts(_context); + if (rank_ == src) { + GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size); + } + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setRoot(src); + opts.setTag(_tag); + gloo::scatter(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared( + rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag); + task->Run(); + return task; +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { + ::gloo::transport::tcp::attr attr; + attr.iface = ifname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) { + ::gloo::transport::tcp::attr attr; + attr.hostname = hostname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDefaultDevice() { + std::array hostname{}; + auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( + "Get hostname error for createDefaultDevice.")); + ::addrinfo* result; + result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); + ::addrinfo* cur; + for (cur = result; cur != nullptr; cur = cur->ai_next) { + SocketType socket = + ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (socket == -1) { + continue; + } + ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen); +#ifdef _WIN32 + closesocket(socket); +#else + close(socket); +#endif + if (ret == -1) { + continue; + } + break; + } + freeaddrinfo(result); + if (cur != nullptr) { + return createDeviceForHostname(hostname.data()); + } + return createDeviceForHostname("127.0.0.1"); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h new file mode 100644 index 0000000000000000000000000000000000000000..24f156571a427128f09cd28e632212f47fa4cd47 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_store.h" + +constexpr const char* GLOO_BACKEND_NAME = "GLOO"; + +namespace paddle { +namespace distributed { + +class ProcessGroupGloo : public ProcessGroup { + public: + class GlooTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + explicit GlooTask(int rank, const std::vector& input_tensors, + CommType comm_type); + + ~GlooTask() = default; + + virtual void Run() = 0; + bool Wait(std::chrono::milliseconds timeout) override { return true; } + bool IsCompleted() override { return true; } + void Synchronize() override {} + + protected: + friend class ProcessGroupGloo; + }; + + class GlooStore : public ::gloo::rendezvous::Store { + public: + explicit GlooStore( + const std::shared_ptr& store) + : _store(store) {} + + ~GlooStore() = default; + + std::vector get(const std::string& key) override { + VLOG(3) << "GlooStore::get"; + auto value = _store->get(key); + return std::vector(value.begin(), value.end()); + } + + void wait(const std::vector& keys) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + } + + void set(const std::string& key, const std::vector& value) override { + VLOG(3) << "GlooStore::set"; + std::vector tmp(value.begin(), value.end()); + _store->set(key, tmp); + } + + void wait(const std::vector& keys, + const std::chrono::milliseconds& timeout) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + // wait(keys); + } + + protected: + std::shared_ptr _store; + }; + + class GlooOptions { + public: + GlooOptions() = default; + ~GlooOptions() = default; + static std::shared_ptr create() { + return std::make_shared(); + } + std::shared_ptr<::gloo::transport::Device> device; + }; + + explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, + int world_size, + std::shared_ptr options); + + ~ProcessGroupGloo() = default; + + std::shared_ptr Broadcast( + std::vector& inputs, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr AllReduce( + std::vector& inputs, + const AllreduceOptions& opts = AllreduceOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + std::shared_ptr<::gloo::Context> get_context() { return _context; } + uint64_t next_tag() { return _tag++; } + + const std::string GetBackendName() const override { + return GLOO_BACKEND_NAME; + } + + // Helper functions for Gloo. 
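  // createDeviceForInterface binds the Gloo TCP transport to a named NIC
  // (attr.iface), createDeviceForHostname binds it to whatever address the
  // given hostname resolves to (attr.hostname), and createDefaultDevice
  // probes the local hostname and falls back to 127.0.0.1 when none of its
  // addresses can be bound (see the implementation in ProcessGroupGloo.cc
  // above).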
+ static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname( + const std::string& hostname); + static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface( + const std::string& ifname); + static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); + + protected: + uint32_t _tag; + std::shared_ptr _context; + std::shared_ptr _store; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc new file mode 100644 index 0000000000000000000000000000000000000000..2deeb7ca03003d0b6c8fa0948afa0a3394639f8b --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" + +DECLARE_bool(hccl_blocking_wait); +// DECLARE_bool(use_stream_safe_npu_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static HcclReduceOp ToHCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ( + it != red_type.end(), true, + platform::errors::InvalidArgument("Invalid hccl reduction. 
" + "Must be Min | Max | Prod | Sum")); + return it->second; +} + +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { + const uint8_t* bytes = reinterpret_cast(&hcclID); + std::ostringstream oss; + for (size_t i = 0; i < sizeof(hcclID); ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +// bool CheckTensorsInNPUPlace(const std::vector& tensors) { +// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { +// return t.place() == platform::DeviceType::NPU; +// }); +// } + +void SyncDefaultStream( + const std::vector& places, + std::vector& hcclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + hcclEvents[i].Record(*dev_ctx[i]); + hcclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupHCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + hcclComms_.resize(places.size()); +} + +ProcessGroupHCCL::HCCLTask::~HCCLTask() {} + +void ProcessGroupHCCL::HCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + platform::NPUStreamWaitEvent(default_ctx->stream(), + control_events_[i].GetRawNPUEvent()); + } +} + +bool ProcessGroupHCCL::HCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sandyhouse): Add timeout for wait, now timeout unused +bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + return true; +} + +// Same as Wait +void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, + int rank, int size) + : ProcessGroup(rank, size), store_(store) {} + +void ProcessGroupHCCL::BroadcastUniqueHCCLID( + std::vector& hccl_ids) { // NOLINT + if (rank_ == 0) { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto hccl_id = std::vector( + reinterpret_cast(&hccl_ids[i]), + reinterpret_cast(&hccl_ids[i]) + sizeof(HcclRootInfo)); + store_->set(key, hccl_id); + } + } 
else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], 
places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 0000000000000000000000000000000000000000..83d509be2cdd7b79faf4e2a2f510c34361b94157 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,129 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
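// Illustrative note, not part of this patch: in the HCCL Broadcast above the
// global root is flattened from (source_rank, source_root) as
//   root = source_rank * tensors.size() + source_root,
// e.g. with one tensor per rank, source_rank = 1 and source_root = 0 give
// global root 1. A minimal helper expressing the same arithmetic (the name is
// hypothetical):
inline int FlattenBroadcastRoot(int source_rank, int source_root,
                                size_t tensors_per_rank) {
  return source_rank * static_cast<int>(tensors_per_rank) + source_root;
}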
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index fe2325423b460d7b42e08b03cf9b083bc94fc7b6..67715f410d443c38a1c5d92c560a35a909c5ec1c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); @@ -139,42 +142,41 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { 
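// Illustrative sketch, not part of this patch: the store-based rendezvous
// pattern that BroadcastUniqueNCCLID below (and BroadcastUniqueHCCLID above)
// follows. Rank 0 serializes each unique id under a well-known key; the other
// ranks block on get() for the same key. KVStore and RendezvousUniqueIds are
// hypothetical stand-ins for paddle::distributed::Store and the real methods.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

template <typename KVStore, typename UniqueId>
void RendezvousUniqueIds(KVStore* store, int rank, const std::string& prefix,
                         std::vector<UniqueId>* ids) {
  for (size_t i = 0; i < ids->size(); ++i) {
    const std::string key = prefix + std::to_string(i);
    if (rank == 0) {
      const auto* bytes = reinterpret_cast<const uint8_t*>(&(*ids)[i]);
      store->set(key, std::vector<uint8_t>(bytes, bytes + sizeof(UniqueId)));
    } else {
      auto value = store->get(key);  // TCPStore::get() waits until the key exists
      std::memcpy(&(*ids)[i], value.data(),
                  std::min(value.size(), sizeof(UniqueId)));
    }
  }
}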
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } } + + if (!barrierTensors_.empty()) { + // If we use the work to do barrier, we should block cpu + for (auto& place : places_) { + platform::CUDADeviceGuard gpuGuard(place); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } + } return true; } // Same as Wait void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } -ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, +ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size) - : ProcessGroup(rank, size), strategy_(strategy) {} - -void ProcessGroupNCCL::BcastNCCLId( - std::vector& nccl_ids, // NOLINT - int root, int server_fd) { - if (strategy_.local_rank_ == root) { - std::vector other_trainers; - for (auto& ep : strategy_.trainer_endpoints_) { - if (ep != strategy_.current_endpoint_) { - other_trainers.push_back(ep); - } - } - platform::SendBroadCastCommID(other_trainers, &nccl_ids); - } else { - platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, - &nccl_ids); - } -} + : ProcessGroup(rank, size), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT - - int server_fd = -1; - if (rank_ != 0) { - server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) - .socket(); + if (rank_ == 0) { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto nccl_id = std::vector( + reinterpret_cast(&nccl_ids[i]), + reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); + store_->set(key, nccl_id); + } + } else { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&nccl_ids[i], ret.data(), ret.size()); + } } - BcastNCCLId(nccl_ids, 0, server_fd); } // create NCCLManager cache for places_key @@ -193,13 +195,17 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( nccl_ids.resize(1); auto& nccl_id = nccl_ids.front(); + for (auto& place : places) { + used_place_ids_.insert(place.GetDeviceId()); + } + if (rank_ == 0) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(nccl_ids); - VLOG(3) << "init nccl rank: " << strategy_.local_rank_ - << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); std::vector> dev_ctx; @@ -274,6 +280,54 @@ std::shared_ptr ProcessGroupNCCL::Collective( return task; } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + memory::RecordStream(dense_tensor->Holder(), + 
places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + std::shared_ptr ProcessGroupNCCL::AllReduce( std::vector& tensors, const AllreduceOptions& opts) { PADDLE_ENFORCE_EQ( @@ -317,5 +371,241 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Barrier( + const BarrierOptions& opts) { + std::vector places; + + if (!opts.place_ids.empty()) { + for (auto place_id : opts.place_ids) { + places.emplace_back(place_id); + } + } else if (!used_place_ids_.empty()) { + for (auto place_id : used_place_ids_) { + places.emplace_back(place_id); + } + } else { + auto numGPUs = GetSize(); + int place_id = static_cast(rank_ % numGPUs); + places.emplace_back(place_id); + } + + std::vector barrierTensors; + barrierTensors.reserve(places.size()); + + platform::CUDADeviceGuard gpuGuard; + for (auto& place : places) { + gpuGuard.SetDeviceIndex(place.GetDeviceId()); + auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU); + barrierTensors.push_back(dt); + } + auto task = ProcessGroupNCCL::AllReduce(barrierTensors); + auto nccl_task = dynamic_cast(task.get()); + nccl_task->barrierTensors_ = std::move(barrierTensors); + return task; +} + +void CheckTensorsInDifferentDevices(const std::vector& tensors, + const size_t num_devices) { + PADDLE_ENFORCE_EQ( + tensors.size() == 0, false, + platform::errors::InvalidArgument("Tensor list must be nonempty.")); + PADDLE_ENFORCE_LE( + tensors.size(), num_devices, + platform::errors::InvalidArgument( + "Tensor list mustn't be larger than the number of available GPUs.")); + + std::set used_devices; + + for (const auto& t : tensors) { + PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true, + platform::errors::InvalidArgument( + "Tensors must be CUDA and dense tensor.")); + + const auto inserted = used_devices.insert(t.inner_place()).second; + PADDLE_ENFORCE_EQ(inserted, true, + platform::errors::InvalidArgument( + "Tensors must be on distinct GPU devices.")); + } +} + +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, int dst_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + return platform::dynload::ncclSend( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); + return task; +} + +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, int src_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclRecv( + output_tensor->data(), output_tensor->numel(), + platform::ToNCCLDataType(output.type()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); + return task; +} + +std::shared_ptr ProcessGroupNCCL::AllGather( + 
std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllGather( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), comm, stream); + }, + CommType::ALLGATHER); +} + +void* GetPointerByOffset(void* raw_pointer, size_t offset, + experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + 
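        // Unlike AllToAll and Scatter in this file, Reduce hands the whole
        // tensor to a single call: ncclReduce combines the inputs from all
        // ranks with the mapped reduce_op and writes the result only on
        // opts.root_rank; other ranks' output buffers are not updated.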
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input_tensor->data(), output_tensor->data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); + }, + CommType::REDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + } + }, + CommType::SCATTER); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 9f06566d1c86386acad3758be283e716f46c1951..aa2a2b8fa2088cd30729ba5e6184ef7a9c507bf3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" @@ -65,6 +66,7 @@ class ProcessGroupNCCL : public ProcessGroup { virtual ~NCCLTask(); std::vector control_events_; + std::vector barrierTensors_; protected: std::vector places_; @@ -74,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); @@ -88,13 +90,36 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + 
std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, const std::vector& inputs); protected: - ProcessGroupStrategy strategy_; + std::shared_ptr store_; std::shared_ptr nccl_comm_; std::mutex mutex_; std::unordered_map>> @@ -106,6 +131,8 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector>> places_to_ctx_; + std::set used_place_ids_; + private: void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); @@ -118,6 +145,11 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& outputs, // NOLINT Fn fn, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); }; diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 654d06686957bd4242fa474c215ccf7c117e5910..973f7c643542757c0bce68f8ccdefeadc97f15d4 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -32,5 +32,18 @@ struct BroadcastOptions { int source_root = 0; }; +struct BarrierOptions { + std::vector place_ids; +}; + +struct ReduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; + int root_rank = 0; +}; + +struct ScatterOptions { + int root_rank = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc new file mode 100644 index 0000000000000000000000000000000000000000..59f3ea3b0a7d85651e7780b4b11875f19b70931e --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
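// Illustrative sketch, not part of this patch: the core bucketing idea behind
// Eager_AssignGroupBySize in the new reducer.cc below, stripped of the
// sparse-gradient and per-dtype bookkeeping. Tensors (represented here only by
// their byte sizes) are appended to the current group until its total reaches
// the size limit, at which point a new group is started. The function name is
// hypothetical.
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::vector<size_t>> AssignGroupBySizeSimple(
    const std::vector<int64_t>& tensor_bytes, size_t group_size_limit) {
  std::vector<std::vector<size_t>> groups;
  std::vector<size_t> current;
  size_t current_bytes = 0;
  for (size_t i = 0; i < tensor_bytes.size(); ++i) {
    current.push_back(i);
    current_bytes += static_cast<size_t>(tensor_bytes[i]);
    if (current_bytes >= group_size_limit) {
      // this group is full; start a new one for the next tensor
      groups.emplace_back(std::move(current));
      current.clear();
      current_bytes = 0;
    }
  }
  if (!current.empty()) {
    groups.emplace_back(std::move(current));
  }
  return groups;
}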
+ +#include "paddle/fluid/distributed/collective/reducer.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace distributed { + +std::vector> Eager_AssignGroupBySize( + const std::vector tensors, + const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices) { + PADDLE_ENFORCE_EQ( + tensors.size(), is_sparse_gradient.size(), + platform::errors::PreconditionNotMet( + "tensors len must be equal to is_sparse_gradient len, but " + "[%lu] != [%lu]", + tensors.size(), is_sparse_gradient.size())); + auto check_perm = [](const std::vector &x) -> bool { + size_t len = x.size(); + std::vector cnt(len, 0); + for (size_t i = 0; i < len; ++i) { + if (x[i] >= static_cast(len) || x[i] < 0 || cnt[x[i]]) { + return false; + } + cnt[x[i]]++; + } + return true; + }; + + PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices), + platform::errors::PreconditionNotMet( + "tensor_indices must be a permutation from 0 to %lu", + tensor_indices.size())); + // the return vector + std::vector> res; + + // Key: the var type + // Value: should use which index in group_size_limits for group size limit + std::map group_limit_index; + + // Key: the var type + // Value: + std::map, size_t>> + next_group; + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto &var = tensors[i]; + + size_t tensor_real_index = i; + if (!tensor_indices.empty()) { + tensor_real_index = tensor_indices[i]; + } + + if (is_sparse_gradient[tensor_real_index]) { + // we keep sparse var a single group + res.push_back({tensor_real_index}); + continue; + } + + const auto &var_dtype = var.dtype(); + VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype; + auto &group_info = next_group[var_dtype]; + + int64_t var_size = -1; + + if (var.is_dense_tensor()) { + var_size = + std::dynamic_pointer_cast(var.impl())->numel(); + } else { + VLOG(3) << "var " << var.name() + << " is not tensor or selected_rows, so skip it"; + continue; + } + + group_info.first.push_back(tensor_real_index); + group_info.second += experimental::SizeOf(var_dtype) * var_size; + // group_info.second += framework::SizeOfType(var_dtype) * var_size; + + if (group_limit_index.find(var_dtype) == group_limit_index.end()) { + // means it is the first var of var_dtype + group_limit_index[var_dtype] = 0; + } + auto &cur_limit_index = group_limit_index[var_dtype]; + if (group_info.second >= group_size_limits[cur_limit_index]) { + // exceed group capacity and create a new group + res.emplace_back(std::move(group_info.first)); + group_info = std::pair, size_t>(); + cur_limit_index = + (std::min)(cur_limit_index + 1, group_size_limits.size() - 1); + } + } + + // add the final groups + for (auto &e : next_group) { + auto &group_info = e.second; + if (!group_info.first.empty()) { + res.emplace_back(std::move(group_info.first)); + } + } + + for (const auto &group_index : res) { + PADDLE_ENFORCE_NE( + group_index.empty(), true, + platform::errors::PreconditionNotMet( + "AssignGroupBySize construct empty group, please check.")); + } + if (tensor_indices.empty()) { + std::sort(res.begin(), res.end(), + [](const std::vector &x, const std::vector &y) { + return x.front() < y.front(); + }); + } + return res; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h new file mode 100644 index 0000000000000000000000000000000000000000..f8c75385ef8bd6891df8eda6faa93c73091c37f5 --- /dev/null +++ 
b/paddle/fluid/distributed/collective/reducer.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +namespace paddle { +namespace distributed { +using Tensor = paddle::experimental::Tensor; + +std::vector> Eager_AssignGroupBySize( + const std::vector, const std::vector& is_sparse_gradient, + const std::vector& group_size_limits, + const std::vector& tensor_indices = {}); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 56d8da3eca4b5a82ff6cdb8f4e3ff8638a02b437..0d5d328fd32cc2e12d4f4e94c94dae51f0c040bc 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -46,7 +48,8 @@ void Carrier::Init( const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place) { + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; @@ -60,7 +63,7 @@ void Carrier::Init( microbatch_scopes_.resize(num_micro_batches); for (int i = 0; i < num_micro_batches; ++i) { microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program); + CopyParameters(i, program, inference_root_scope_vars); } // TODO(fleet_exe dev): thread pool @@ -80,12 +83,23 @@ void Carrier::Release() { Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } -void Carrier::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { +void Carrier::CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars) { auto& global_block = program.Block(0); + std::map inference_root_scope_var_map; + for (auto var_name : inference_root_scope_vars) { + inference_root_scope_var_map.insert({var_name, 1}); + } for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 
0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(5) << "Create persistable var: " << var->Name() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 9a74fa78c0e7638cd9c5201b92b06619c1f5b10c..d35a3260915e2cfd40bea9dc03fe6af7d9d04c54 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -57,9 +57,12 @@ class Carrier final { const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place); + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars = {}); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 457549a27b4b7ed6305b107cfd319ecae026a53b..e946d78550ff1bb0155843a680fbec33fdca9aa3 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" @@ -52,7 +53,8 @@ void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank) { + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( "Fleet executor is inited with empty task node")); @@ -64,6 +66,37 @@ void FleetExecutor::Init( } } auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
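  // The two loops below implement the note above in two passes: pass 1 erases
  // every protected name in inference_root_scope_vars from each op's
  // unused-variable list and records which ops were touched; pass 2 removes
  // ops whose unused-variable lists became empty, so GC never frees a variable
  // that still backs an inference output.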
+ std::vector changed_ops; + for (auto pair : unused_vars) { + const framework::OperatorBase* op = pair.first; + std::vector unused = pair.second; + for (auto name : inference_root_scope_vars) { + auto iter = std::find(unused.begin(), unused.end(), name); + if (iter != unused.end()) { + VLOG(3) << "Removing var: [" << name + << "] from the unused vars list of op: [" << op->Type() << "]"; + unused.erase(iter); + if (std::find(changed_ops.begin(), changed_ops.end(), op) == + changed_ops.end()) { + // record the op whose unused vars have been updated + changed_ops.emplace_back(op); + } + } + } + // update the unused vars list in the map + unused_vars[op] = unused; + } + for (auto op : changed_ops) { + auto iter = unused_vars.find(op); + if (iter->second.empty()) { + // remove those ops in the map that have empty unused vars list + VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; + unused_vars.erase(iter); + } + } runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { @@ -82,17 +115,18 @@ void FleetExecutor::Init( carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier, scope, place, num_micro_batches, program_desc); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc, + inference_root_scope_vars); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const framework::ProgramDesc& program_desc) { +void FleetExecutor::InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), program_desc, scope, - num_micro_batches, place); + num_micro_batches, place, inference_root_scope_vars); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index fa65309127bec50869c52d2f3c85477910ccb37b..ccdb3dcc459489db9f342a2302fae3d777170313 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -42,15 +42,17 @@ class FleetExecutor final { const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank); + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars = {}); void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, int64_t num_micro_batches, - const framework::ProgramDesc& program_desc); + void InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 
6de7038b3231f2fb302dd970273c565c5a718b73..95e4c73305998e4190c1547cb2f92809e360b216 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -52,11 +52,20 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } -void TaskNode::Init() { +void TaskNode::Init(bool use_feed_fetch_ops) { + if (!use_feed_fetch_ops) { + VLOG(3) << "TaskNode will be inited without feed and fetch ops"; + } if (ops_.empty()) { // Q (for fleet executor dev): should we need another reset funct? VLOG(3) << "Task node will be inited by calling Init()."; for (const auto& op_desc : program_->Block(0).AllOps()) { + if (!use_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], " + << op_desc->Type() << " -> " << op_desc->Output("Out")[0]; + continue; + } ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); } for (const auto& op : ops_vec_) { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index b655d140d37a5bdf547a278eec3355ef4638539f..4764d4fd4af87adf3df31f2dabb614da7d719861 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -46,7 +46,7 @@ class TaskNode final { ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); - void Init(); + void Init(bool use_feed_fetch_ops = true); int64_t rank() const { return rank_; } int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 18920d06f38543cc3f7aeb045e7c3058143e006e..ba039385a74ba45aa1f33ba38138d8e5213f2e00 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -24,10 +24,14 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(fill_constant); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2673314d222d2b32e42c42a3a94df71a1887914a..7b4ae7e70ff6f033e038f1c5214f46e0876257d2 100644 --- a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -25,13 +25,26 @@ namespace distributed { class Store { public: - Store() = delete; + Store() : _timeout(tcputils::kNoTimeout) {} explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} virtual ~Store() = default; - virtual int64_t add(const std::string& key, int64_t value) = 0; - virtual std::vector get(const std::string& key) = 0; - virtual void wait(const std::string& key) = 0; + virtual int64_t add(const std::string& key, int64_t value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual std::vector get(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void wait(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void set(const std::string& key, const std::vector& value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index de85ac0d910e93257a308052ca1fcf193680a183..b0d5add49565ffb19762778ddd44a388b140c0ee 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -27,11 +27,13 @@ namespace detail { constexpr int INFTIME = -1; -std::unique_ptr MasterDaemon::start(SocketType socket) { - return std::make_unique(socket); +std::unique_ptr MasterDaemon::start(SocketType socket, + int nranks) { + return std::make_unique(socket, nranks); } -MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { +MasterDaemon::MasterDaemon(SocketType socket, int nranks) + : _listen_socket(socket), _nranks(nranks) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -64,27 +66,35 @@ void MasterDaemon::_do_add(SocketType socket) { tcputils::send_value(socket, new_value); } +void MasterDaemon::_do_set(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_set"; + std::string key = tcputils::receive_string(socket); + auto value = tcputils::receive_vector(socket); + _store[key] = value; +} + void MasterDaemon::_do_get(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_get"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); PADDLE_ENFORCE_NE( iter, _store.end(), platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); std::vector value = iter->second; - VLOG(3) << "TCPStore: value (" - << std::stoll(std::string(reinterpret_cast(value.data()), - value.size())) - << ") for key (" << key << ")."; tcputils::send_vector(socket, value); } void MasterDaemon::_do_stop(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_stop"; ReplyType value = 
ReplyType::STOP_WAIT; - _stop = true; tcputils::send_value(socket, value); + if (--_nranks == 0) { + _stop = true; + } } void MasterDaemon::_do_wait(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_wait"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); auto reply = ReplyType::STOP_WAIT; @@ -126,35 +136,47 @@ void MasterDaemon::run() { } for (size_t i = 1; i < fds.size(); i++) { - if (fds[i].revents == 0) { - continue; - } - - Command command = tcputils::receive_value(fds[i].fd); - VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; - - switch (command) { - case Command::ADD: - _do_add(fds[i].fd); - break; - case Command::GET: - _do_get(fds[i].fd); - break; - case Command::WAIT: - _do_wait(fds[i].fd); - break; - case Command::STOP: - _do_stop(fds[i].fd); - break; + try { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + VLOG(3) << "TCPStore: recv command: " << static_cast(command) + << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::SET: + _do_set(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + default: + VLOG(0) << "Unknow command: " << static_cast(command); + exit(-1); + } + } catch (...) { + fds.erase(fds.begin() + i); + _sockets.erase(_sockets.begin() + i - 1); } } } } -std::unique_ptr TCPServer::create(uint16_t port) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket); + server->_master_daemon = MasterDaemon::start(socket, nranks); return server; } @@ -200,7 +222,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, size_t num_workers, std::chrono::seconds timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port); + _server = detail::TCPServer::create(port, num_workers); } _client = detail::TCPClient::connect(host, port); @@ -213,36 +235,41 @@ void TCPStore::waitWorkers() { } add(_init_key, 1); - if (_server) { - auto begin = std::chrono::steady_clock::now(); - do { - auto value = get(_init_key); - int completed = std::stoi(std::string(value.begin(), value.end())); - VLOG(3) << completed << " worker ready, total " << _num_workers; - if (completed >= _num_workers) { - break; - } - const auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - begin); - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { - PADDLE_ENFORCE_EQ( - completed, _num_workers, - platform::errors::InvalidArgument( - "TCPStore timeouted and not all workers got ready.")); - } - } while (true); - } + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + 
platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); VLOG(3) << "TCPStore initialized."; } int64_t TCPStore::add(const std::string& key, int64_t value) { + VLOG(3) << "TCPStore add."; _client->send_command_for_key(Command::ADD, _key_prefix + key); _client->send_value(value); return _client->receive_value(); } +void TCPStore::set(const std::string& key, const std::vector& value) { + VLOG(3) << "TCPStore set."; + _client->send_command_for_key(Command::SET, _key_prefix + key); + _client->send_vector(value); +} + std::vector TCPStore::get(const std::string& key) { wait(key); _client->send_command_for_key(Command::GET, _key_prefix + key); @@ -252,6 +279,7 @@ std::vector TCPStore::get(const std::string& key) { void TCPStore::wait(const std::string& key) { ReplyType reply; + VLOG(3) << "TCPStore wait."; do { _client->send_command_for_key(Command::WAIT, _key_prefix + key); @@ -261,6 +289,7 @@ void TCPStore::wait(const std::string& key) { } TCPStore::~TCPStore() { + VLOG(3) << "~TCPStore"; _client->send_command_for_key(Command::STOP, ""); ReplyType ret = _client->receive_value(); PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index cd706dd6640acf5e0b5b3714175dac7a6cecb25a..17c1d8ea30a421f04d054d59ac93c8c60406ef68 100644 --- a/paddle/fluid/distributed/store/tcp_store.h +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -27,15 +27,16 @@ namespace paddle { namespace distributed { enum class ReplyType { WAITING, STOP_WAIT }; -enum class Command { ADD, GET, WAIT, STOP }; +enum class Command { ADD, GET, SET, WAIT, STOP }; namespace detail { class MasterDaemon { public: - static std::unique_ptr start(SocketType listen_socket); + static std::unique_ptr start(SocketType listen_socket, + int nranks); MasterDaemon() = delete; - explicit MasterDaemon(SocketType listen_socket); + explicit MasterDaemon(SocketType listen_socket, int nranks); ~MasterDaemon(); private: @@ -43,18 +44,20 @@ class MasterDaemon { void _do_add(SocketType socket); void _do_wait(SocketType socket); void _do_get(SocketType socket); + void _do_set(SocketType socket); void _do_stop(SocketType socket); SocketType _listen_socket; std::vector _sockets; std::unordered_map> _store; std::thread _background_thread{}; + int _nranks; bool _stop = false; }; class TCPServer { public: TCPServer() = default; - static std::unique_ptr create(std::uint16_t port); + static std::unique_ptr create(std::uint16_t port, int nranks); private: std::unique_ptr _master_daemon; @@ -97,6 +100,7 @@ class TCPStore : public Store { int64_t add(const std::string& key, int64_t value) override; std::vector get(const std::string& key) override; void wait(const std::string& key) override; + void set(const std::string& key, const std::vector& value) override; private: void waitWorkers(); diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc index d0561d0b9a9c5b01c32620e72d21ed562e42637e..a28cba288333d7f1c2a705049c29b59f43a70cc5 100644 --- a/paddle/fluid/distributed/store/tcp_utils.cc +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -46,9 +46,10 @@ void close_socket(SocketType socket) { hints.ai_socktype = SOCK_STREAM; const char* node = host.empty() ? nullptr : host.c_str(); + const char* port_cstr = port.empty() ? 
nullptr : port.c_str(); int n; - n = ::getaddrinfo(node, port.c_str(), &hints, &res); + n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); const char* proto = (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 5e16ab2b391d0223a8b6fd9bae78cced9d4e2f11..f9d1b705390cb1c22bf9336292af30363c0010cf 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,8 +1,8 @@ -set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) -cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) +cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 734cabdc3dc914349e2ad30b657bfb6542a7472a..07fa40165167ce2352018c0e1b1cb08222d5a181 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + VLOG(6) << "Construct GradNodeAccumulation"; weak_grad_ = meta->WeakGrad(); SetDefaultGradInOutMeta(); } - ~GradNodeAccumulation() override = default; + ~GradNodeAccumulation() override { + VLOG(6) << "Destruct GradNodeAccumulation"; + } // Functor: perform backward computations virtual std::vector> operator()( diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index ebbef286f7923003295224a38c56c50eb3fa9c5a..4f634c6884b45a83f09348d5cc4749e6272b2a51 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(fluid_generated) endif() diff --git 
a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 77d8ec57efcaa6c4e83a69f4b2a97b128b174389..81ff07b8963f97b8c257e0204c4cdcc0fc82ea63 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index c0150a1730d52b3410ba4ea0d31674fbfed596ae..247fde6ed1f869542969b068cdae9f59cedd732a 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase { const std::vector& tensors); void SetAttributes_scale(float scale); - + std::string name() override { return ""; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed private: diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 60b35340eabd1fa03f59cc0b7ea278351be96df1..c70bb80c35c78ca476c8612d804bdd1e9b3838ff 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index c7927716300528fdfa571de720ce12e7246b5f1d..9abd7be49d44cbab4b3482961df461dd7164328f 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,49 +52,44 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -static void RetainGradForRegularNode( - const paddle::experimental::Tensor& tensor) { - AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); - if (meta->RetainGrads()) { +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details return; } else { - meta->SetRetainGrads(true); - } + AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } - std::weak_ptr weak_grad_tensor = - meta->WeakGrad(); + std::weak_ptr weak_grad_tensor = + meta->WeakGrad(); - // Define Hook - auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - 
return *grad_tensor.get(); + // Define Hook + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } } else { VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; return paddle::experimental::Tensor(); } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + }; - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, - std::make_shared(hook)); -} - -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - if (IsLeafTensor(tensor)) { - // Leaf tensor's grad will always be retained - // Refer to implementation of AccumulationNode for more details - return; - } else { - RetainGradForRegularNode(tensor); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index a8e0ed7a41a043e12332ad347f673a6c27e5f1ec..dc79a8a45a246798551a0bcce8c487f67183220b 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +/* --- Black Ops list that's NO NEED to apply code generation --- */ +static std::unordered_set black_ops_list = {"run_program"}; + static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' @@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type, } static void PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" std::vector functor_list = {"a", "b"}; operators_with_attrs["fused_elemwise_add_activation"] = {}; @@ -996,6 +993,29 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + // If single output slotname and not duplicable, + // then generate: "egr::AutogradMeta* p_autograd_out = + // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + const std::string& output_autograd_name = "p_autograd_" + output_name; + + if (output.duplicable()) { + const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = + " std::vector %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + 
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } + } + VLOG(6) << "Generated outputs autograd_meta"; + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,31 +1044,6 @@ static std::string GenerateGradNodeCreationContent( } VLOG(6) << "Generated inputs autograd_meta"; - // If single output slotname and not duplicable, - // then generate: "egr::AutogradMeta* p_autograd_out = - // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : out_vars) { - const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - - // Skip Intermediate Tensor - - if (output.duplicable()) { - const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } - } - VLOG(6) << "Generated outputs autograd_meta"; - std::string prepare_autograd_meta_str = ""; prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += "\n"; @@ -1156,11 +1151,13 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( @@ -1173,17 +1170,20 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = @@ -1199,11 +1199,12 @@ static std::string GenerateGradNodeCreationContent( " %s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " if(require_any_grad) {\n" + " VLOG(6) << \" Construct 
Grad for %s \"; \n" " egr::EagerUtils::PassStopGradient(%s);\n" "%s\n }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, pass_stop_gradient_args, + compute_require_grad_args, op_type, pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; @@ -2078,22 +2079,24 @@ static std::string GenerateGradNodeHeaderContents( const char* GRAD_NODE_TEMPLATE = "class GradNode%s : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() {}\n" + " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "GradNode%s \"; }\n" " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " - "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n" - " ~GradNode%s() override = default;\n" + "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " + "Construct GradNode%s \"; }\n" + " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" " virtual std::vector> " "operator()(const " "std::vector>& grads) " "override;\n" "\n" + " std::string name() override { return \" GradNode%s \"; } \n " + "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" - " std::string name() { return \"GradNode%s\"; }\n" - "\n" " private:\n" " // TensorWrappers\n" "%s\n" @@ -2190,8 +2193,8 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generated TensorWrapper"; std::string grad_node_str = paddle::string::Sprintf( - GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, op_type, + GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, + op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; @@ -2343,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { if (!CheckOpProto(op_proto)) continue; const std::string& op_type = op_proto->type(); + if (black_ops_list.count(op_type)) { + continue; + } /* ----------------------------- */ /* ---- Collect Information ---- */ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index c6bca01205e19c58d5924f4e9d60bb76164fee2b..53af6c1048d2454b1e9f375b837103930026ae54 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml") +set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") +set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index c6e56e34627a52bc19df7e8d87371811fcec8697..967891fe5227dcd6129c0ef1808fba7720711568 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,6 +23,20 @@ core_ops_returns_info = {} core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" + +yaml_types_mapping = { + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', + 'Tensor' : 'Tensor', + 'Tensor[]' : 'std::vector', + 'Tensor[Tensor[]]' : 'std::vector>', + 'Scalar' : 'paddle::experimental::Scalar', + 'ScalarArray' : 'paddle::experimental::ScalarArray' +} + def ParseArguments(): parser = argparse.ArgumentParser( @@ -59,7 +73,9 @@ def IsPlainTensorType(string): def IsVectorTensorType(string): - vector_tensor_types = ['list(Tensor)'] + vector_tensor_types = [ + 'std::vector>', 'std::vector' + ] if string in vector_tensor_types: return True return False @@ -110,6 +126,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -118,9 +135,13 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret @@ -180,6 +201,9 @@ def ParseYamlArgs(string): arg_name = m.group(3).split("=")[0].strip() default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None + + assert arg_type in yaml_types_mapping.keys() + arg_type = yaml_types_mapping[arg_type] if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -190,35 +214,30 @@ def ParseYamlArgs(string): def ParseYamlReturns(string): - # Example: Tensor, Tensor + # Example0: Tensor(out), Tensor(out1) + # Example1: Tensor, Tensor + # Example2: Tensor[](out), Tensor - # list = [ ["", ret_type, orig_position], ...] + # list = [ [ret_name, ret_type, orig_position], ...] returns_list = [] returns = [x.strip() for x in string.strip().split(",")] + for i in range(len(returns)): ret = returns[i] - returns_list.append(["", ret, i]) - - return returns_list + ret_name = "" + if "(" in ret and ")" in ret: + # Remove trailing ')' + ret = ret[:-1] + ret_type = ret.split("(")[0].strip() + ret_name = ret.split("(")[1].strip() + else: + ret_type = ret.strip() -def ParseYamlReturnsWithName(string): - # Example: Tensor(out), Tensor(out1) - - # list = [ [ret_name, ret_type, orig_position], ...] 
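A note on the parsing rule above: the merged ParseYamlReturns now accepts both the bare `Tensor, Tensor` form and the named `Tensor(out), Tensor(out1)` form, and maps every YAML type through yaml_types_mapping. The snippet below is a standalone sketch of that rule only, with a two-entry stand-in for the mapping and a hypothetical parse_returns helper; it is not the generator's code.

```python
# Stand-in for yaml_types_mapping, trimmed to two entries for illustration.
ILLUSTRATIVE_TYPES = {
    'Tensor': 'Tensor',
    'Tensor[]': 'std::vector<Tensor>',
}

def parse_returns(string):
    # Produces [ [ret_name, ret_type, orig_position], ... ], the same shape
    # as the generator's returns_list.
    returns_list = []
    for i, ret in enumerate(x.strip() for x in string.split(",")):
        ret_name = ""
        if "(" in ret and ")" in ret:
            ret = ret[:-1]                      # drop the trailing ')'
            ret_type, ret_name = (s.strip() for s in ret.split("("))
        else:
            ret_type = ret
        assert ret_type in ILLUSTRATIVE_TYPES, f"unknown YAML type: {ret_type}"
        returns_list.append([ret_name, ILLUSTRATIVE_TYPES[ret_type], i])
    return returns_list

# "Tensor(out), Tensor[](grads)" ->
#   [['out', 'Tensor', 0], ['grads', 'std::vector<Tensor>', 1]]
print(parse_returns("Tensor(out), Tensor[](grads)"))
```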
- returns_list = [] - - returns = [x.strip() for x in string.strip().split(",")] + assert ret_type in yaml_types_mapping.keys() + ret_type = yaml_types_mapping[ret_type] - atype = r'(.*?)' - aname = r'(.*?)' - pattern = f'{atype}\({aname}\)' - for i in range(len(returns)): - ret = returns[i] - m = re.search(pattern, ret) - ret_type = m.group(1) - ret_name = m.group(2) assert "Tensor" in ret_type returns_list.append([ret_name, ret_type, i]) @@ -240,7 +259,7 @@ def ParseYamlForwardFromBackward(string): function_returns = m.group(3) forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) - forward_returns_list = ParseYamlReturnsWithName(function_returns) + forward_returns_list = ParseYamlReturns(function_returns) return forward_inputs_list, forward_attrs_list, forward_returns_list @@ -270,7 +289,7 @@ def ParseYamlBackward(args_str, returns_str): args_str = re.search(args_pattern, args_str).group(1) inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturnsWithName(returns_str) + returns_list = ParseYamlReturns(returns_str) return inputs_list, attrs_list, returns_list @@ -496,11 +515,18 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format( aname, GetConstReference(atype), aname, saved_attr_name, aname) - ATTRIBUTE_MEMBER_TEMPLATE = """ - {} {} = {}; -""" - attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - RemoveConstAndReference(atype), saved_attr_name, default_val) + if default_val: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {} = {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name, default_val) + else: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name) # End: SetAttributes & Attribute Members grad_node_name = GetGradNodeName(fwd_api_name) @@ -514,7 +540,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( const std::vector>& grads) override; - + std::string name() override {{ return \" {} \"; }} // SetTensorWrapperX, SetTensorWrapperY, ... 
{} // SetAttributes @@ -529,8 +555,9 @@ class {} : public egr::GradNodeBase {{ """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - set_tensor_wrapper_methods_str, set_attribute_methods_str, - tensor_wrapper_members_str, attribute_members_str) + grad_node_name, set_tensor_wrapper_methods_str, + set_attribute_methods_str, tensor_wrapper_members_str, + attribute_members_str) return node_declaration_str @@ -587,16 +614,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ std::vector> {}::operator()(const std::vector>& grads) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -650,7 +684,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -678,18 +712,24 @@ def GenerateNodeCreationCodes( # SetTensorWrappers set_tensor_wrappers_list = [] - for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items(): is_optional = (name in optional_inputs) + if is_fwd_input: if is_optional: set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: + if IsVectorTensorType(atype): + tw_name = f"api_result[{pos}]" + else: + tw_name = f"api_result" + if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -829,7 +869,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) - len( @@ -979,7 +1023,9 @@ def 
GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/include/sparse_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1000,11 +1046,12 @@ def GenerateNodeHFile(filepath, node_declaration_str): def GenerateForwardCCFile(filepath, forward_definition_str): file_contents = """ +#include "paddle/phi/api/lib/dygraph_api.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" - """ file_contents += GenerateCoreOpInfoDefinition() @@ -1021,6 +1068,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): #include "paddle/phi/api/all.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/to_static/run_program_op_func.h" """ file_contents += GenerateCoreOpInfoDeclaration() @@ -1032,134 +1080,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( - 
fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + if 'backward' not in fwd_api.keys(): + continue + 
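The loop above is the heart of the multi-file change: the generator now takes comma-separated YAML paths, any file whose path contains "sparse" gets `namespace = "sparse"`, and the strings generated from that file are later wrapped in a matching C++ namespace block. A condensed sketch of that flow, assuming a hypothetical gen_for_file placeholder for the real per-API generation:

```python
def gen_for_file(path):
    # Hypothetical placeholder for the real per-file code generation.
    return f"// definitions generated from {path}\n"

def generate(api_yaml_path_arg):
    forward_definition_str = ""
    for api_yaml_path in api_yaml_path_arg.split(","):
        # "sparse" in the path selects the extra C++ namespace, mirroring
        # the `namespace = "sparse"` branch above.
        namespace = "sparse" if "sparse" in api_yaml_path else ""
        file_definitions = gen_for_file(api_yaml_path)
        if namespace:
            forward_definition_str += (
                f"namespace {namespace} {{\n{file_definitions}}}\n")
        else:
            forward_definition_str += file_definitions
    return forward_definition_str

print(generate("api.yaml,sparse_api.yaml"))
```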
+ assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) + print("Generated 
Node Declaration: ", node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9329dc5ffc9dd0faa36b8ff6a8373387bc2678c7..eee32a2c5057d523212a4faa5eca8678e961f417 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,34 +14,28 @@ import os import argparse -from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap + +skipped_fwd_api_names = set(["scale"]) atype_to_parsing_function = { "bool": "CastPyArg2Boolean", "int": "CastPyArg2Int", "long": "CastPyArg2Long", + "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", "string": "CastPyArg2String", - "bool[]": "CastPyArg2Booleans", - "int[]": "CastPyArg2Ints", - "long[]": "CastPyArg2Longs", - "float[]": "CastPyArg2Floats", - "double[]": "CastPyArg2Float64s", - "string[]": "CastPyArg2Strings" -} - -atype_to_cxx_type = { - "bool": "bool", - "int": "int", - "long": "long", - "float": "float", - "string": "std::string", - "bool[]": "std::vector", - "int[]": "std::vector", - "long[]": "std::vector", - "float[]": "std::vector", - "double[]": "std::vector", - "string[]": "std::vector" + "std::vector": "CastPyArg2Booleans", + "std::vector": "CastPyArg2Ints", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Floats", + "std::vector": "CastPyArg2Float64s", + "std::vector": "CastPyArg2Strings", + 
"paddle::experimental::Scalar": "CastPyArg2Scalar", + "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", + "paddle::experimental::Backend": "CastPyArg2Backend", + "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -55,15 +49,9 @@ def ParseArguments(): return args -def GetCxxType(atype): - if atype not in atype_to_cxx_type.keys(): - assert False - - return atype_to_cxx_type[atype] - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): + print(f"Unable to find {atype} in atype_to_parsing_function.") assert False return atype_to_parsing_function[atype] @@ -71,7 +59,7 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, forward_attrs_list, forward_outputs_position_map, - optional_inputs): + optional_inputs, is_forward_only): # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] @@ -98,11 +86,10 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, # Get Attributes for name, atype, _, pos in forward_attrs_list: parsing_function = FindParsingFunctionFromAttributeType(atype) - cxx_type = GetCxxType(atype) key = f"{name}" parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {cxx_type} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" + parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) @@ -139,11 +126,20 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ + namespace_str = "" + if len(namespace) > 0: + namespace_str = f"{namespace}::" + + if is_forward_only: + fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name + else: + fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) + python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) + fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -197,7 +193,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { """ core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -225,6 +221,13 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #pragma once #include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include 
"paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -257,53 +260,80 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + api_yaml_paths = args.api_yaml_path.split(",") + + python_c_functions_reg_str = "" + python_c_functions_str = "" + + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + + if "sparse" in api_yaml_path: + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + + python_c_function_list = [] + python_c_function_reg_list = [] + for fwd_api in fwd_api_list: + + # We only generate Ops with grad + is_forward_only = False + if 'backward' not in fwd_api.keys(): + is_forward_only = True + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + if fwd_api_name in skipped_fwd_api_names: + continue + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( 
+ fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", forward_inputs_list) + print("Prased Original Forward Attrs List: ", forward_attrs_list) + print("Parsed Original Forward Returns List: ", + forward_returns_list) + + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( + fwd_api_name, forward_inputs_position_map, forward_attrs_list, + forward_outputs_position_map, optional_inputs, is_forward_only) + python_c_function_list.append(python_c_function_str) + python_c_function_reg_list.append(python_c_function_reg_str) + print("Generated Python-C Function: ", python_c_function_str) + + # Append Namespace + python_c_functions_reg_str += ",\n".join( + python_c_function_reg_list) + "," + python_c_functions = "\n".join(python_c_function_list) + if len(namespace) > 0: + python_c_functions_str += f"""namespace {namespace} {{ + {python_c_functions} +}} +""" + + else: + python_c_functions_str += python_c_functions python_c_str = GeneratePythonCWrappers(python_c_functions_str, python_c_functions_reg_str) diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3..dca76d3b8a0db8c4284960005bfbad33ce23e20d 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin) :Should we use pointer instead of object? std::shared_ptr grad_{ - std::make_shared( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. 
This class will make grad op easy diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7073ca8f0527ba8237da734db0c8724baa2a49ec..934497d7d179c1732bde68c147ed86661c25ddae 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -48,12 +48,16 @@ std::unordered_map getInDegreeMap( } visited.insert(node); + PADDLE_ENFORCE_NOT_NULL( + node, + paddle::platform::errors::Fatal( + "We got null node when we traverse the backward graph, and this " + "should not happened please check your code and contact us.")); // Find and append next nodes const std::vector>& edges = node->GetEdges(); for (const auto& edge_list : edges) { for (const Edge& edge : edge_list) { GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -67,7 +71,6 @@ std::unordered_map getInDegreeMap( } } } - return node_in_degree_map; } @@ -221,10 +224,11 @@ void RunBackward(const std::vector& tensors, << " 's name is: " << grad_output_tensor.name(); auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - node_input_buffers_dict[next_node] = - std::make_unique(next_node->InputMeta()); + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 35416281f188892ec11413a19abad9b3e5c29e76..427be83c3bbee31eaa0c7e3d26d2d9599b344450 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -30,6 +30,7 @@ namespace egr { GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { + VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); // adj_edges has the same num as backward outputs @@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "inputs's slot num.")); if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), @@ -244,7 +249,7 @@ GradNodeBase::ApplyGradientHooks( if (!out.defined() || !out.initialized()) { out = (*hook)(tensors[slot_id][rank]); } else { - // If more than one hook is registered, the input to the next hook func + // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook out = 
(*hook)(out); } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index eeac1cca4acf33190ce30613e4a86e99a95b651b..16513f05e0777a8e57f54c925d68867dda656612 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -76,10 +76,10 @@ class GradSlotMeta { class GradNodeBase { public: - GradNodeBase() = default; + GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); // TODO(jiabin): Should we have other constructor here? - virtual ~GradNodeBase() = default; + virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; } /** * operator() designed to contian the real backward execution logic, it should diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index c1506d8139b432c93d0bed35073b404192a927f6..2bfb9937c8c9167d712535dca71ef02efa1f3f78 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(performance_tests) endif() diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index bb84e2dda81bafe624fe7734a0a47391eeb0adfa..535c93ac53b1751d9634476e47f32dc0cbe22708 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode(float val, int in_num, int out_num) : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } + std::string name() override { return "GradTestNode"; } std::vector> operator()( const std::vector>& grads) override { diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 8c6eeca9d3d5d80fd5bfe943ef87ba8640ada4f2..384fdcd6f97c4b318341db68cdd88b644d42d22a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -24,6 +24,8 @@ #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); + // TODO(jiabin): remove nolint here!!! 
using namespace egr; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 6c4bf9a4f17e6f88503f0a1d6ec2f3029000b6f0..adb3246ee8c808c9f62fde0228f40cccb2f9ac88 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -33,6 +33,14 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT @@ -72,6 +80,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 14e7ce8cfcfb4dea0907cd128873223c8e5859a2..bd70e84d9b461490f53ac6692d55860da1bfc9d8 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -32,11 +32,19 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); + TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -74,6 +82,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = 
phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); @@ -186,7 +238,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 3292de9363696dae30d853980eca6fb1ba1055cc..a9d297c1c64f7b64373237a0500802a5c883aedd 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,6 +34,14 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index e9b7d10070dbf22f10e617d34f143992d19fb659..bd9eaa09ca9a406da943c8a0b0f37b674d5ea3c2 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,8 +34,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); + namespace paddle { namespace imperative { @@ -248,7 +256,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc 
index 96126fa5466aace442dfb742f9902539916b853e..769bd7f687f4584d44bbfa30b73611a3128289bf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + RunBackward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e152c6da935eacba8d93c0d6ab1a71..86bf13707ed40b0c37ccb54695cca3d165768cb6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const paddle::experimental::Tensor& Y, bool accuracy_check = false); diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index dbdb52eb53655201ac06b1362c9776ba98bba3eb..c65ad4641cf2206cc0f97d91f1fb24e50b7b63cd 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,7 +6,7 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git 
a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index a4bc56bd606f3fbb0f9152d58acb5c8edeecf905..0c894ed267fcdd08d44d4df08bfaf0554874aebf 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,6 +30,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Backward, SingleNodeEmptyGrad) { diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 524872b2e55638d25697388aa50724f49f6e3818..36594f1aac8cdb131bb77f1396dca19a0c2e8cc0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 49bbfc77741a5b82ac9a564e25b484e5dabf77a7..dc44d95daac1d9109bbf2a1d04a8a47b081cead9 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,6 +27,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Forward, SingleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 5a7bafb2fe37051c0ad054c130d77dd6e05319d2..f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -30,6 +30,13 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 4b7077b13bdd6c48a0a3846656bd3a6337eb9f80..2a5ad53204a6201149bec0b3dac0fa3baf441f2e 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,6 +30,12 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { TEST(Generated, Sigmoid) { diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 9cda961741f55e9b4b7fc8dac61fe4a7c96567cf..d546df4ed087a99a28096a5336fab3826991534a 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ 
b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 15b2a62dca751859882e82d46acaa46f27c2c518..56813c498d2410caa452da7a334c393b230c65bf 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -27,6 +27,12 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index ea821d195099f3d632e0d1b2d4937bac812563c8..24e5da060111f083ef9b65574e75295fa07f8f43 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,6 +23,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(TensorUtils, Test) { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6f8bccd64e45f015a5c1aed44fbfdfc6f68660f1 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" + +inline void run_program_dygraph_function( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + VLOG(2) << "start run run_program"; + // Call forward function + RunProgramAPI(x, params, out, step_scope, dout, attrs); + VLOG(2) << "start run run_program grad"; + + // Prepare Autograd Meta + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + if (require_any_grad) { + std::vector out_names; + for (auto& t : deref_out) { + out_names.emplace_back(t.name()); + } + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + auto grad_node = std::make_shared(1, 2); + + grad_node->SetFwdOutNames(out_names); + // Set Attributes + grad_node->SetAttrMap(attrs); + // Set TensorWrappers + grad_node->SetFwdX(x); + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); + grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + + grad_node->SetGradInMeta(&p_autograd_outs, 0); + // Set Next Edges + grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); + grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + egr::EagerUtils::CheckAndRetainGrad(deref_out); + } +} diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h new file mode 100644 index 0000000000000000000000000000000000000000..ae5d86664a346fd8a1d877f9e1dd74f687302595 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -0,0 +1,468 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
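run_program_dygraph_function above follows the usual eager-mode wiring: it runs the forward program through RunProgramAPI, and only when some input requires grad does it build a GradNodeRunProgram, record the forward tensors, scopes, and attributes on it, and register it as the history of the outputs. Per the comment in that code ("1 means [out_grad], 2 means [x_grad, paramx_grad]"), the node has one backward-input slot and two backward-output slots. A short fragment spelling out that slot layout, mirroring the calls in the function above (not a standalone program):

    // Sketch of the grad-node wiring in run_program_dygraph_function:
    //   bwd_in_slot_num  = 1 -> slot 0 carries the gradients of `out`
    //   bwd_out_slot_num = 2 -> slot 0: gradients of `x`, slot 1: gradients of `params`
    auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
    grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
    grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
    grad_node->SetGradInMeta(&p_autograd_outs, /*slot id*/ 0);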
+ +#pragma once + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace details { +using Tensor = paddle::experimental::Tensor; + +static std::vector DereferenceTensors( + const std::vector &tensor_ptr) { + std::vector res; + for (auto *t : tensor_ptr) { + res.emplace_back(*t); + } + return res; +} + +static std::vector GetTensorsName(const std::vector &ins) { + std::vector in_names; + for (auto &in_t : ins) { + in_names.emplace_back(in_t.name()); + } + return in_names; +} + +static std::vector GetTensorsName( + const std::vector &ins) { + std::vector in_names; + for (auto *in_t : ins) { + in_names.emplace_back(in_t->name()); + } + return in_names; +} + +static void CheckInputVarStatus(const Tensor &tensor) { + PADDLE_ENFORCE_EQ( + tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); + + PADDLE_ENFORCE_EQ(tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); +} + +static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, + const Tensor &dst_tensor) { + auto name = dst_tensor.name(); + PADDLE_ENFORCE_EQ(dst_tensor.defined(), true, + paddle::platform::errors::InvalidArgument( + "dst_tensor shall be defined.")); + + if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensor %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. Expect type is DenseTensor", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's internal " + "scope is not initialized.", + name)); + } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensodfr %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. 
Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. 
Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); + // #ifdef PADDLE_WITH_MKLDNN + // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + // #endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope + // global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); +} + +class GradNodeRunProgram : public egr::GradNodeBase { + public: + GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~GradNodeRunProgram() override = default; + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector> &grads) + override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + PADDLE_ENFORCE_EQ( + grads.size(), 1, + paddle::platform::errors::InvalidArgument( + "The out_grads.size() of RunProgramGradOp should be equal to 1.")); + + VLOG(3) << "out_grads[0].size() : " << grads[0].size(); + std::vector x_grad; + std::vector params_grad; + ConstructGradTensors(x_, &x_grad); + ConstructGradTensors(params_, ¶ms_grad); + std::vector x_grad_ptr; + std::vector params_grad_ptr; + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + params_grad_ptr.emplace_back(&i); + } + + // auto x_grad_ptr = ConstructGradTensors(x_); + // auto params_grad_ptr = ConstructGradTensors(params_); + + PADDLE_ENFORCE_EQ( + grads[0].size(), fwd_out_names_.size(), + paddle::platform::errors::InvalidArgument( + "The grads[0].size() and fwd_out_names_.size() should be equal.")); + for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + const_cast(grads[0][i]) + .set_name(fwd_out_names_[i] + "@GRAD"); + } + + RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr, + params_grad_ptr); + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + return {x_grad, params_grad}; + // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { + x_ = tensors; + } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + void SetFwdOutNames(std::vector out_names) { + fwd_out_names_ = out_names; + } + + protected: + void ConstructGradTensors( + const std::vector &fwd_tensors, + std::vector *grad_tensors) { + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). 
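The ConstructGradTensors helpers here rely on a simple naming convention: each grad tensor takes its forward tensor's name with an "@GRAD" suffix, which is how RunProgramGradAPI later locates the matching variables in the executor's inner scope. A short worked illustration of the conventions used in this file (the forward name and the count of three output grads are illustrative only):

    // Naming convention applied in GradNodeRunProgram::operator() and
    // ConstructGradTensors:
    //   forward output : "matmul_v2_0.tmp_0"
    //   its gradient   : "matmul_v2_0.tmp_0@GRAD"
    //
    // Grad-section offset computed in RunProgramGradAPI: each forward output
    // contributes one `shape` and one `fill_constant` op, so with 3 output grads
    //   start_op_index = orig_end_op_index + 3 * 2 = orig_end_op_index + 6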
+ VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + grad_tensors->emplace_back(fwd_t.impl()); + auto &grad_t = grad_tensors->back(); + grad_t.set_name(fwd_t.name() + "@GRAD"); + } + } + + void ConstructGradTensors( + const std::vector &fwd_tensors) { + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + auto grad_tesnor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); + grad_tesnor.set_name(fwd_t.name() + "@GRAD"); + } + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector step_scope_; + + std::vector fwd_out_names_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; +}; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index a7e5931f1f9bc66006fb1a37836be1eda371953e..8a57d2694535e9c27e88416468fe5a67ce020b43 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,12 +122,22 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); + } autograd_meta->SetGradNode(grad_node); } } void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); + } autograd_meta->SetGradNode(grad_node); } diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt old mode 100644 new mode 100755 index 14aecb5fd43c49ece1f79cb9c8e2b70e9d07df07..aa92a3b2226c1fca1fa7326e76ef29b0b38cd8d6 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -235,6 +235,7 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) + py_proto_compile(ps_py_proto SRCS ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -242,12 +243,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -259,6 +261,7 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." @@ -437,11 +440,10 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc deleted file mode 100644 index 49a1e0774a6b1a7a1afd154029850ceb52040759..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/custom_kernel.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined _WIN32 || defined __APPLE__ -#else -#define _LINUX -#endif - -#include "paddle/fluid/framework/custom_kernel.h" -#include "paddle/phi/core/custom_kernel.h" - -namespace paddle { -namespace framework { - -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { -#ifdef _LINUX - typedef phi::CustomKernelMap& get_custom_kernel_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetCustomKernelMap")); - - if (func == nullptr) { - LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetCustomKernelMap symbol in this lib."; - return; - } - auto& custom_kernel_map = func(); - phi::RegisterCustomKernels(custom_kernel_map); - LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; -#else - VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; -#endif - return; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f1c9cc16ab8a52df429af8d94ab718..948eaab40b4f64f2a87a83fab80d4eade5288e91 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98aa8f0736610f659d3b94e3c2f1e023..fdf74d2f769fcdd49da19c0118a23d6b8fbb06e4 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary. @@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70bc3b48bf24b050673f3da7b69b1755..5eb584aaefa981ab6c6f25df7a765ae9a3d0194a 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. It is null in default, means diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index b7cb2ce0f0102bd34940864960118f396c5dcad7..59220fc9cdaf1f05f70e8cfe961071c1fad3a760 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -186,45 +186,63 @@ void HashTable::insert(const KeyType* d_keys, size_t len, template void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; size_t num = container_->size(); KeyType unuse_key = std::numeric_limits::max(); thrust::pair* kv = container_->data(); - for (size_t i = 0; i < num; ++i) { - if (kv[i].first == unuse_key) { - continue; - } - ValType& gpu_val = kv[i].second; + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[1] = gpu_val.delta_score; - cpu_val[2] = gpu_val.show; - cpu_val[3] = gpu_val.clk; - cpu_val[4] = gpu_val.lr; - cpu_val[5] = gpu_val.lr_g2sum; - cpu_val[6] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + cpu_val[6] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } } - } #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); - downpour_value->count_ = gpu_val.show; - for (int x = 0; x < gpu_val.mf_size; x++) { - downpour_value->data_[x] = gpu_val.mf[x]; - } + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } #endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); } - container_->prefetch(devid, stream); + // container_->prefetch(devid, stream); } template diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9f2bdeffecf62764f5cbe5bea9cb50d4830be43b..c1f8041cc1eca34b858608ffb77598ce095d0b4f 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -231,19 +231,19 @@ void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( CustomStreamGarbageCollector::CustomStreamGarbageCollector( const platform::CustomPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::DeviceGuard guard(place); - stream_.reset(new platform::stream::Stream); + phi::DeviceGuard guard(place); + stream_.reset(new phi::stream::Stream); stream_->Init(place); - callback_manager_.reset(new platform::CallbackManager(stream_.get())); + callback_manager_.reset(new phi::CallbackManager(stream_.get())); } CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { - platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + phi::DeviceGuard guard(this->dev_ctx_->GetPlace()); stream_->Synchronize(); stream_->Destroy(); } -platform::stream::Stream *CustomStreamGarbageCollector::stream() const { +phi::stream::Stream *CustomStreamGarbageCollector::stream() const { return stream_.get(); } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index a67860c6087e0f173e09d2a7c131703260c562fd..f0027c676050b8c31c0bc0ca4ab3b6444f29e1a2 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -230,14 +230,14 @@ class CustomStreamGarbageCollector : public GarbageCollector { void Wait() const override; - platform::stream::Stream *stream() const; + phi::stream::Stream *stream() const; protected: void ClearCallback(const std::function &callback) override; private: - std::unique_ptr stream_; - std::unique_ptr callback_manager_; + std::unique_ptr stream_; + std::unique_ptr callback_manager_; }; #endif diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index e14b91d935d05c12442f3d0205c1e97df9697ec3..29c7f5d0ce73cbf1af18e6f5869d59d2200917ad 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -88,6 +88,10 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsForInferShape() const override { return true; } + + bool IsRuntime() const override { return ctx_.IsRuntime(); } + private: const InferShapeContext& ctx_; }; @@ -127,7 +131,9 @@ class CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return phi::make_ddim(var->GetShape()); + + return var->GetShape().empty() ? 
phi::make_ddim({0UL}) + : phi::make_ddim(var->GetShape()); } } @@ -228,16 +234,8 @@ class CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -250,6 +248,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case 1: share lod of LoDTensor + share_lod(meta_tensor); + share_dims(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); @@ -308,22 +316,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { - if (ctx->HasInput(in_name)) { - infer_meta_context.EmplaceBackInput(std::make_shared( - ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + if (ctx->HasInputs(in_name)) { + auto input_var = ctx->GetInputVarPtrs(in_name); + if (input_var.size() == 1) { + infer_meta_context.EmplaceBackInput( + std::make_shared(input_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> inputs; + inputs.reserve(input_var.size()); + for (const auto& in : input_var) { + inputs.push_back( + std::make_shared(in, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackInputs(std::move(inputs)); + } } else { infer_meta_context.EmplaceBackInput({nullptr}); } } - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); - } - } auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; @@ -348,13 +359,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { + for (size_t i = 0; i < infershape_inputs.size(); ++i) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } + int64_t num_ele = 0; if (vars.size() == 1) { num_ele = 1; const auto& tensor_dims = vars[0]->GetShape(); @@ -362,16 +373,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, num_ele *= tensor_dims[i]; } } else { - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); - PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, - platform::errors::InvalidArgument( - "The shape is constructed by multi-tensor, " - "every tensor's dims should be 1. 
But your " - "shape has tensor that dims is %s.", - tensor_dims.size())); - num_ele += tensor_dims[0]; - } + num_ele = vars.size(); } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -383,10 +385,18 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr(std::move( + phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::ScalarArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to ScalarArray when " - "construct KernelContext.", + "construct InferMetaContext.", attr_name)); } } @@ -414,7 +424,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else if (ctx->HasInput(attr_name)) { const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); - if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -490,6 +499,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } + } else { + // do nothing + } + } + + for (auto& out_name : output_names) { + if (ctx->HasOutputs(out_name)) { + auto output_var = ctx->GetOutputVarPtrs(out_name); + if (output_var.size() == 1) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + output_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> outputs; + outputs.reserve(output_var.size()); + for (const auto& out : output_var) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackOutputs(std::move(outputs)); + } + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 64c8371d583ffef621e5009504d14308dd7b997c..b692b6ffab08014f7de6ef4e5488445204396853 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -29,7 +29,7 @@ namespace framework { phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type); -#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ struct functor_name : public paddle::framework::InferShapeBase { \ void operator()( \ paddle::framework::InferShapeContext* ctx) const override { \ diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 53dcc19fcbae88ab5ccfcc498037327946029927..2eeefb19a1aa8c5c9e4f92ff06618c719bb30785 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel( } // namespace framework } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, +DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, InferShapeUtilsTestInferShapeFunctor, - PT_INFER_META(paddle::framework::TestInferMeta)); + PD_INFER_META(paddle::framework::TestInferMeta)); REGISTER_OPERATOR(infer_shape_utils_test, 
paddle::framework::InferShapeUtilsTestOp, paddle::framework::InferShapeUtilsTestOpMaker, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index dad5358590cb1497453681ce940898314a1d06eb..a1f2d6edca6a2db5d5bb4c8cf896c492f20ed2da 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -78,7 +78,6 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) -pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) @@ -158,6 +157,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc deleted file mode 100644 index f28c9988bd858ad00a5c5a532b7b484315557d8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
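Stepping back to the infershape_utils changes earlier in this patch: the functor macro is renamed from DELCARE_INFER_SHAPE_FUNCTOR to DECLARE_INFER_SHAPE_FUNCTOR and the wrapper from PT_INFER_META to PD_INFER_META. A minimal usage sketch of the renamed pair, modeled on the updated infershape_utils_test.cc; the names my_op, MyOp, MyOpMaker, MyOpInferShapeFunctor and MyInferMeta are hypothetical placeholders, not part of this patch.

// Hypothetical operator wiring its shape inference to a phi InferMeta function
// through the renamed macros (mirrors infershape_utils_test.cc above).
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(MyInferMeta));

REGISTER_OPERATOR(my_op, MyOp, MyOpMaker, MyOpInferShapeFunctor);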
- -#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -#define GET_CONV_BN_NODES(pattern_name) \ - /* OPERATORS */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ - /* CONV inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ - /* CONV outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ - /* Affine Channel inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ - /* Affine channel outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ - -void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, - const ir::Node& ac_scale, - const LoDTensor& ac_bias_tensor, - LoDTensor* eltwise_y_in_tensor) { - using EigenVectorArrayMap = - Eigen::Map>; - using ConstEigenVectorArrayMap = - Eigen::Map>; - using EigenMatrixArrayMap = Eigen::Map< - Eigen::Array>; - - // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); - - auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); - - ConstEigenVectorArrayMap scale_array(scale_tensor->data(), - scale_tensor->numel(), 1); - ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), - ac_bias_tensor.numel(), 1); - - EigenVectorArrayMap eltwise_y_in_array( - eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 1); - - eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; - - // Re-compute weight of conv2d from AffineChannel - auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); - auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); - auto* weights_data = weights->mutable_data(platform::CPUPlace()); - - EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], - weights_shape_2d[1]); - - weights_array_2d.colwise() *= scale_array; - - // Check for subnormal values that slows down convolution execution - for (int i = 0; i < weights->numel(); ++i) { - if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; - } -} - -ConvAffineChannelFusePass::ConvAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - 
.End(); - - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, false /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvAffineChannel fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " - "it's wrong if data_format of conv is not " - "NCHW."; - } - - // Get affine_channel bias for resizing eltwise_y! - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - // Create eltwise_y (conv bias) variable - VarDesc eltwise_y_in_desc( - patterns::PDNodeName(name_scope_, "eltwise_y_in")); - // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); - eltwise_y_in_desc.SetDataType( - framework::TransToProtoVarType(ac_bias_tensor->dtype())); - eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); - eltwise_y_in_desc.SetPersistable(true); - - // Initialize eltwise_y - auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); - auto* eltwise_y_in_tensor = - scope->Var(eltwise_y_in_node->Name())->GetMutable(); - eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); - std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 0.0f); - - // update weights and biases - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // create an elementwise add node. - OpDesc desc; - desc.SetInput("X", std::vector({conv_out->Name()})); - desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); - desc.SetOutput("Out", std::vector({ac_out->Name()})); - desc.SetType("elementwise_add"); - desc.SetAttr("axis", 1); - desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); - - auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
- - GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); - - IR_NODE_LINK_TO(conv_out, eltwise_op); - IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); - IR_NODE_LINK_TO(eltwise_op, ac_out); - found_conv_ac_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_ac_count); -} - -ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, true /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvBN fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " - "enabled, it's wrong if data_format of conv " - "is not NCHW."; - } - // OPERATORS - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); - // BIAS inputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); - // BIAS outputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); - - // Get eltwise_y (conv bias) variable - auto* eltwise_y_in_tensor = - scope->FindVar(eltwise_y_in->Name())->GetMutable(); - - // Get batch norm bias - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // Update the elementwise_add node - eltwise->Op()->SetAttr("axis", 1); - eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - - 
GraphSafeRemoveNodes(graph, - {ac_scale, ac_bias, affine_channel, eltwise_out}); - - IR_NODE_LINK_TO(eltwise, ac_out); - - found_conv_ac_count++; - }; - - gpd(graph, handler); - AddStatis(found_conv_ac_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_affine_channel_fuse_pass, - paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, - paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .EQ("affine_channel", 0)); -REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .LE("elementwise_add", 1) - .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h deleted file mode 100644 index 8cfaf5c6a89f06b453dbbc94b5a7fe8b83e5c111..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -/* - * Fuse the Conv and ConvAffineChannel. - */ -class Graph; - -class ConvAffineChannelFusePass : public FusePassBase { - public: - ConvAffineChannelFusePass(); - virtual ~ConvAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_affine_channel_fuse"}; -}; - -class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { - public: - ConvEltwiseAddAffineChannelFusePass(); - virtual ~ConvEltwiseAddAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f48224cbdc24fe9706a3c4eae029c6dc35381ad2 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + std::string activation = "none"; + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, ele_out); + + GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name(); + found_linear_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( + ir::Graph *graph, const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act); + + int found_linear_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + auto activation = act_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, act_out); + + // Only need to check weight.shape[1] for auxiliary pointer + // and mark it the act op is fused for backward epilogue fusion. + // That because cuBlasLt epilogue's restriction. + if (is_training) { + int divisor_of_n = activation == "relu" ? 128 : 8; + if (matmul_w_shape[1] % divisor_of_n) return; + + VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace")); + auto *reserve_space_node = g->CreateVarNode(&reserve_space); + + cache->InsertFusedActivation( + GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()), + reserve_space_node); + + gemm_epilogue_node->Op()->SetOutput("ReserveSpace", + {reserve_space_node->Name()}); + + if (!is_act_grad_x_from_act) { + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern); + act_grad_op->Op()->RenameInput(ele_out->Name(), + reserve_space_node->Name()); + IR_NODE_LINK_TO(reserve_space_node, act_grad_op); + } + IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node); + } + + GraphSafeRemoveNodes(g, + {matmul_op, matmul_out, ele_add_op, ele_out, act_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> " + << act_out->Name(); + found_linear_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, + bool without_x_gradient) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + 
ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 
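Two eligibility rules drive the fusions implemented above: matmul_v2 must look like it came from paddle.nn.Linear (2-D weight, rank >= 2 input, no trans_x/trans_y, no fused reshape/transpose attributes), and in training graphs the weight's N dimension must satisfy the cuBlasLt epilogue alignment before a ReserveSpace output is attached (N % 128 == 0 for relu, N % 8 == 0 for gelu). A standalone sketch of those two checks; the helper names are illustrative, not framework APIs.

#include <cstdint>
#include <string>
#include <vector>

// Mirrors IsGemmFromLinear_: paddle.nn.Linear yields a 2-D weight, an input of
// rank >= 2, and no transposed operands on matmul_v2.
bool LooksLikeLinearGemm(const std::vector<int64_t>& x_shape,
                         const std::vector<int64_t>& w_shape, bool trans_x,
                         bool trans_y) {
  return w_shape.size() == 2 && x_shape.size() >= 2 && !trans_x && !trans_y;
}

// Mirrors the divisor_of_n check in FuseLinearActFwd: the cuBlasLt auxiliary
// (ReserveSpace) output is only usable when N is suitably aligned.
bool EpilogueAuxUsable(const std::string& activation, int64_t n) {
  const int64_t divisor_of_n = activation == "relu" ? 128 : 8;
  return n % divisor_of_n == 0;
}

Worth noting that in FuseLinearActFwd the alignment check runs after the fused node has already been created and linked, so an early return there appears to leave that node in the graph; whether that is intentional is not clear from this patch alone.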
0000000000000000000000000000000000000000..575ffee73d60e9bd5d4f5af7538d01789268cc9a --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class Graph; +class Node; + +class EpiloguePassActivationCache { + public: + EpiloguePassActivationCache() {} + + EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete; + void operator=(const EpiloguePassActivationCache &) = delete; + + bool HasFusedActivation(const std::string &key) const { + return fused_activation_space_map_.count(key); + } + + ir::Node *GetFusedActivationSpace(const std::string &key) { + if (HasFusedActivation(key)) { + return fused_activation_space_map_.find(key)->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "The key (%d) of EpiloguePassActivationCache does not exist.", key)); + } + + void InsertFusedActivation(const std::string &key, ir::Node *const value) { + if (!HasFusedActivation(key)) { + mtx.lock(); + fused_activation_space_map_.insert({key, value}); + mtx.unlock(); + } else { + PADDLE_THROW(platform::errors::AlreadyExists( + "The key (%d) of EpiloguePassActivationCache already exist.", key)); + } + } + + private: + std::unordered_map fused_activation_space_map_; + std::mutex mtx; +}; + +class FuseGemmEpiloguePass : public FusePassBase { + public: + virtual ~FuseGemmEpiloguePass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const; + ir::Graph *FuseLinearActFwd(ir::Graph *graph, + const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const; + ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const; + ir::Graph *FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const; + + private: + bool IsGemmFromLinear_(const std::vector &x_shape, + const std::vector &w_shape, + OpDesc *matmul_v2_op) const; + const std::string GetReserveSpaceCacheKey(const std::string var_name, + int block_id) const { + return std::to_string(block_id) + var_name; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index e4c9dc72128f4850b2e0e4af739fdd381e4a3b1e..d7d866fa98bb5895e4f3175e227f7b3c2ce869b6 100644 --- 
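One detail of EpiloguePassActivationCache above: InsertFusedActivation performs the HasFusedActivation check before taking the mutex, and both error messages format a std::string key with %d (presumably intended as %s). A sketch of an insert that does the existence check and the map update under a single lock; it assumes the fused_activation_space_map_ and mtx members declared in the class, plus <mutex> for std::lock_guard.

// Sketch of a race-free variant of InsertFusedActivation; same members as the
// class above, with the check and the insert done under one lock_guard.
void InsertFusedActivation(const std::string &key, ir::Node *const value) {
  std::lock_guard<std::mutex> guard(mtx);
  bool inserted = fused_activation_space_map_.insert({key, value}).second;
  if (!inserted) {
    PADDLE_THROW(platform::errors::AlreadyExists(
        "The key (%s) of EpiloguePassActivationCache already exist.", key));
  }
}

A delimiter between block_id and var_name in GetReserveSpaceCacheKey would likewise keep cache keys unambiguous.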
a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1461,31 +1461,6 @@ PDNode *patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1526,6 +1501,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto 
*act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto *matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d6400ed6945bf8a60c1d4f357bf58a11d5b87094..0f21906d08d0e4fc8a54472ab40ceb08df9d1949 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ 
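When with_grad_link is set and the activation gradient does not take its input from the activation output, the LinearAct pattern above derives the backward op names by appending "_grad" to each forward activation type before asserting the act_grad link. A tiny standalone sketch of that name derivation; the function name is illustrative.

#include <string>
#include <unordered_set>

// Mirrors the loop in patterns::LinearAct: "relu" -> "relu_grad", "gelu" -> "gelu_grad".
std::unordered_set<std::string> ToActGradTypes(
    const std::unordered_set<std::string> &act_types) {
  std::unordered_set<std::string> act_grad_types;
  for (const auto &act : act_types) {
    act_grad_types.insert(act + "_grad");
  }
  return act_grad_types;
}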
b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -863,6 +863,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns are used to fuse linear and act (ReLu or GeLU) +// formula: act(F.linear(x)) +// op: matmul_v2 + elementwise_add + act +// named nodes: matmul, elementwise_add, act +// matmul_w, matmul_out +// ele_bias, elewise_add_out, act_out +struct LinearAct : public PatternBase { + LinearAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "linear_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_types, + bool with_grad_link, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(matmul); + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(matmul_w); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_bias); + PATTERN_DECL_NODE(act_out); +}; + +// The following patterns are used to fuse linear_grad and act_grad (ReLu or +// GeLU) +// formula: the backward of F.linear( act(x) ) +// op: elementwise_add_grad + matmul_v2_grad + act_grad +// named nodes: ele_add_grad, matmul_grad, act_grad +// ele_grad_bias, ele_grad_dx, ele_grad_dbias +// matmul_grad_x, matmul_grad_dx, matmul_grad_dx +// matmul_grad_dw, act_grad_dx +struct ElewiseAddMatmulAct : public PatternBase { + ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add_grad); + PATTERN_DECL_NODE(matmul_grad); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(ele_grad_bias); + PATTERN_DECL_NODE(ele_grad_dx); + PATTERN_DECL_NODE(ele_grad_dbias); + PATTERN_DECL_NODE(matmul_grad_x); + PATTERN_DECL_NODE(matmul_grad_w); + PATTERN_DECL_NODE(matmul_grad_dx); + PATTERN_DECL_NODE(matmul_grad_dw); + PATTERN_DECL_NODE(act_grad_dx); +}; + // Conv with Elementwise_add as bias // op: conv + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index d33dc7f49feb0f4c9e585d13186d65b6c2d618c0..636a594a657cb0744aac161d928ff9078b1f92bc 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,12 +20,15 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); USE_OP(elementwise_mul); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d05738529dcb885e86cbcabf4405fd75270b..2403e60df3918394e99c3284b2a417e336fc3bae 100644 
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -135,157 +136,9 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << 
"conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } - - if (HasFusedActivation(residual_conv_op)) return; - - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); -} - GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -298,26 +151,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_x_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, 
conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( @@ -335,26 +218,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( conv_output); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_y_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + 
conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -374,39 +287,84 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if (IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + 
GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index c83335da2f629c128fcf4819b2645ab1ef5eae42..c4351b382187d9062a059d013ddb237520645b6d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& graph_with_stats, - OpFuncs&&... 
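(Editor's note, not part of the patch.) The rewritten ApplyImpl above threads a std::pair<ir::Graph*, int> through the three fuse steps, so each step mutates the graph and adds its own match count to the running total. A minimal standalone sketch of that accumulation pattern, with toy Graph and step functions instead of the real pass classes:

#include <cstdio>
#include <utility>

struct Graph {};  // stand-in for ir::Graph
using GraphWithStats = std::pair<Graph*, int>;

// Each "pass step" would rewrite the graph in place and report how many
// patterns it fused; the running total is carried in .second.
GraphWithStats FuseStepA(const GraphWithStats& in) {
  int fused_here = 2;  // pretend two patterns matched
  return std::make_pair(in.first, in.second + fused_here);
}

GraphWithStats FuseStepB(const GraphWithStats& in) {
  int fused_here = 1;
  return std::make_pair(in.first, in.second + fused_here);
}

int main() {
  Graph g;
  GraphWithStats stats = FuseStepA(std::make_pair(&g, 0));
  stats = FuseStepB(stats);  // same chaining style as the rewritten ApplyImpl
  std::printf("total fused: %d\n", stats.second);  // prints 3
  return 0;
}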
op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 96aa95bde337436dd6eb584b3eea5395b5301a34..11190309814e7c75777a6cddd7e4d24bfc7ba9e6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include -#include #include -#include -#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -25,7 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd0abdd150d04dc7536e26151c218..d578ada0db00fed85f7b4f25f1483169c72c2c0b 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,9 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,7 +28,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e46f0d30a01d73fccb95d88771480a..219aae71127ed8963b4bfe4e8ee5e7259dbf7d02 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -37,7 +37,7 @@ USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); @@ -46,7 +46,7 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); USE_OP(tanh_grad); USE_OP(sum); @@ -54,7 +54,7 @@ USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 7b3916bafc93eda8cb1afbf54b706e032c5233dd..bc65231abe7371a931f709c9190b55fde24f0543 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -409,7 +409,7 @@ class ThreadPoolTempl { return false; } platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c45bf32d8b710cb35ec5f86a4a8ba2e1078537e6..eb40a49b4066a7a8c8e9c142a310b815fd73da20 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx, \ paddle::framework::EmptyGradOpMaker) diff --git 
a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d33791f70c4d2f759bcd4f6443a5a1f244673d4f..f8e30c1ee294ecf692e2992b6123232ba1c8bd7d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -254,7 +254,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with CustomDevice support.", place)); #else - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #endif } @@ -264,10 +264,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( Type(), platform::TracerEventType::Operator, 1); - // auto op_name = platform::OpName(outputs_, Type()); - // platform::RecordEvent op_name_record_event( - // op_name, platform::TracerEventType::Operator, 1, - // platform::EventRole::kUniqueOp); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::TracerEventType::Operator, 10, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -1210,6 +1224,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } + } else { + pt_kernel_name = pt_kernel_signature_->name; + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); } #ifdef PADDLE_WITH_XPU bool is_xpu_unsupport = @@ -2048,7 +2065,11 @@ void OperatorWithKernel::BuildPhiKernelContext( // deal with optional here if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { + std::type_index( + typeid(paddle::optional)) || + input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), @@ -2074,6 +2095,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done inputs"; for (size_t i = 0; i < output_names.size(); ++i) { auto it = ctx.outputs.find(output_names[i]); @@ -2098,26 +2120,25 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); - } - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); + if (var) { + 
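(Editor's note, not part of the patch.) The ExecutionContext::HasInputs added above is stricter than HasInput: the whole input slot must exist, be non-empty, and contain no null variables. A standalone sketch of that check over a generic name-to-pointer-list map (illustrative Variable/InputMap types, not the real RuntimeContext):

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct Variable {};  // stand-in for framework::Variable

using InputMap = std::map<std::string, std::vector<const Variable*>>;

// True only if the slot exists, is non-empty, and has no null entries.
bool HasInputs(const InputMap& inputs, const std::string& name) {
  auto it = inputs.find(name);
  if (it == inputs.end() || it->second.empty()) return false;
  for (const Variable* var : it->second) {
    if (var == nullptr) return false;
  }
  return true;
}

int main() {
  Variable x;
  InputMap ins;
  ins["X"] = {&x};
  ins["Y"] = {&x, nullptr};  // slot exists but one element is missing
  assert(HasInputs(ins, "X"));
  assert(!HasInputs(ins, "Y"));
  assert(!HasInputs(ins, "Z"));
  return 0;
}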
if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } + } pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { @@ -2182,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); @@ -2205,15 +2271,17 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr - } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); @@ -2226,6 +2294,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } } } + VLOG(4) << "Done attributes"; } } // namespace framework diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 16718a316513e3574e9a7eb14ed50106c8b0dcb6..1a1171f1dba4d794796ef1421fe386f60a0e587d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { @@ -489,6 +491,8 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.OutputVar(name)->IsType(); } + bool IsForInferShape() const override { return false; } + private: const ExecutionContext& ctx_; }; diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f05d125563311dd2047383373834..47dffd47b7cbbf4a37e6715b40d41024330bc679 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 706815185a1b5b53d1bb8e26274206abc126cfd5..c015e90f71e54691e92c3a36c3d6e053372f64f3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -241,7 +241,6 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; - options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b97aade81bf496ce211485f924757..cdccc4c5546900a141a084281f419c2940b23817 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 
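(Editor's note, not part of the patch.) The attribute branches added above turn a plain attribute vector (int, int64, float, or double) into a list of scalar wrappers before it reaches the phi kernel context, always with a reserve followed by element-wise emplace_back. A small standalone illustration of that widening step, using std::variant as a stand-in for phi::Scalar:

#include <cassert>
#include <cstdint>
#include <variant>
#include <vector>

// Stand-in for phi::Scalar: holds one of the supported attribute types.
using Scalar = std::variant<int, std::int64_t, float, double>;

template <typename T>
std::vector<Scalar> ToScalarList(const std::vector<T>& values) {
  std::vector<Scalar> scalars;
  scalars.reserve(values.size());  // same reserve-then-emplace pattern
  for (const T& value : values) {
    scalars.emplace_back(value);   // each element becomes one Scalar
  }
  return scalars;
}

int main() {
  std::vector<int> shape = {2, 3, 4};
  std::vector<Scalar> as_scalars = ToScalarList(shape);
  assert(as_scalars.size() == 3);
  assert(std::get<int>(as_scalars[1]) == 3);
  return 0;
}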
355291beb60f949b52b681592d42b7da4e80186b..14997dd9610138e32a45ef17abc9276cd1dad172 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; + } else if (kernel_key.backend() == phi::Backend::KPS) { + library_type = LibraryType::kKP; } else { // do nothing } @@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { backend = phi::Backend::GPUDNN; + } else if (kernel_type.library_type_ == LibraryType::kKP) { + backend = phi::Backend::KPS; } else { // do } @@ -121,6 +125,15 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } +#endif +#ifdef PADDLE_WITH_IPU + if (platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "pten missing IPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), + kernel_key.dtype()); + } #endif return phi::KernelKey(); } @@ -229,26 +242,5 @@ static void SetAllocationForUninitializedDenseTensor( dense_tensor->ResetHolder(shared_allocation); } -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place) { - if (phi::DenseTensor::classof(tensor)) { - auto* dense_tensor = static_cast(tensor); - if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) { - SetAllocationForUninitializedDenseTensor(dense_tensor, place); - } - } else if (phi::SelectedRows::classof(tensor)) { - auto* selected_rows = static_cast(tensor); - if (!selected_rows->value().IsInitialized() || - !(selected_rows->place() == place)) { - SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(), - place); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor type is received when setting allocation for " - "output tensor.")); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 1a1f79d82770058ae4010b7a3a3162280ceb1537..a17578816921b2337a76d1a0a69a6c8adbc51c4d 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -62,9 +62,6 @@ class KernelArgsNameMaker { void InitDefaultKernelSignatureMap(); -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place); - // TODO(Wilber): support others device context. template struct ConvertToPhiContext { diff --git a/paddle/fluid/framework/ps.proto b/paddle/fluid/framework/ps.proto new file mode 100755 index 0000000000000000000000000000000000000000..0ae87812bce434be5e664aefea4bba19ae147d28 --- /dev/null +++ b/paddle/fluid/framework/ps.proto @@ -0,0 +1,213 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
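(Editor's note, not part of the patch.) The FallBackToCpu change above extends the existing pattern: when the expected place has no phi kernel (here IPU, like the earlier XPU branch), the kernel key is rewritten to the CPU backend while keeping its layout and dtype. A toy sketch of that rewriting rule with illustrative enums rather than the real phi::KernelKey:

#include <cassert>

enum class Backend { kCPU, kGPU, kIPU };
enum class Layout { kNCHW };
enum class DataType { kFP32 };

struct KernelKey {
  Backend backend;
  Layout layout;
  DataType dtype;
};

// If the requested backend has no kernel, fall back to CPU but preserve the
// layout and dtype of the original key.
KernelKey FallBackToCpu(const KernelKey& requested, bool has_kernel) {
  if (has_kernel) return requested;
  return KernelKey{Backend::kCPU, requested.layout, requested.dtype};
}

int main() {
  KernelKey ipu_key{Backend::kIPU, Layout::kNCHW, DataType::kFP32};
  KernelKey chosen = FallBackToCpu(ipu_key, /*has_kernel=*/false);
  assert(chosen.backend == Backend::kCPU);
  assert(chosen.dtype == DataType::kFP32);  // dtype preserved
  return 0;
}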
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.distributed; +option cc_generic_services = true; +option cc_enable_arenas = true; + +message FsClientParameter { + enum FsApiType { + HDFS = 0; + AFS = 1; + } + optional FsApiType fs_type = 1 [ default = HDFS ]; + optional string uri = 2; // such as afs://xxx.afs.com:9902 + optional string user = 3; // user_name to access fs + optional string passwd = 4; // password + optional int32 buffer_size = 5; // buffer for read/write + optional string hadoop_bin = 51; + optional string afs_conf = 101; +} + +message PSParameter { + optional string worker_class = 1; + optional string server_class = 2; + optional string instance_class = 3; + optional string init_gflags = 4 [ default = "" ]; + optional WorkerParameter worker_param = 101; + optional ServerParameter server_param = 102; + repeated DownpourTrainerParameter trainer_param = 301; + optional FsClientParameter fs_client_param = 501; +} + +message WorkerParameter { + optional DownpourWorkerParameter downpour_worker_param = 1; +} + +message DownpourWorkerParameter { + repeated TableParameter downpour_table_param = 1; +} + +message DownpourServerParameter { + repeated TableParameter downpour_table_param = 1; + optional ServerServiceParameter service_param = 2; +} + +message ServerParameter { + optional DownpourServerParameter downpour_server_param = 1; +} + +message DownpourTrainerParameter { + repeated DenseTableParameter dense_table = 1; + repeated SparseTableParameter sparse_table = 2; + optional int32 push_sparse_per_batch = 3; + optional int32 push_dense_per_batch = 4; + repeated string skip_op = 5; + repeated ProgramConfig program_config = 6; +} + +message DenseTableParameter { + optional int32 table_id = 1; + repeated string dense_variable_name = 2; + repeated string dense_gradient_variable_name = 3; + optional int32 fea_dim = 4; +} + +message SparseTableParameter { + optional int32 table_id = 1; + optional int32 feature_dim = 2; + repeated string slot_key = 3; + repeated string slot_value = 4; + repeated string slot_gradient = 5; +} + +message ServerServiceParameter { + optional string server_class = 1 [ default = "BrpcPsServer" ]; + optional string client_class = 2 [ default = "BrpcPsClient" ]; + optional string service_class = 3 [ default = "BrpcPsService" ]; + optional uint32 start_server_port = 4 + [ default = 0 ]; // will find a avaliable port from it + optional uint32 server_thread_num = 5 [ default = 12 ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +enum TableType { + PS_SPARSE_TABLE = 0; + PS_DENSE_TABLE = 1; + PS_OTHER_TABLE = 2; +} + +message TableParameter { + optional uint64 table_id = 1; + optional string table_class = 2; + optional uint64 shard_num = 3 [ default = 1000 ]; + optional TableAccessorParameter accessor = 4; + optional TensorAccessorParameter tensor = 5; + optional CommonAccessorParameter common = 6; + optional TableType type = 7; + optional bool compress_in_save = 8 [ 
default = false ]; +} + +message TableAccessorParameter { + optional string accessor_class = 1; + optional uint32 fea_dim = 4 [ default = 11 ]; + optional uint32 embedx_dim = 5 [ default = 8 ]; + optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd +} + +message TensorAccessorParameter { + optional string feed_var_name = 1; + optional string fetch_var_name = 2; + optional int64 startup_program_id = 3; + optional int64 main_program_id = 4; + optional string tensor_table_class = 6; +} + +message CommonAccessorParameter { + optional string name = 1; + optional string table_name = 2; + repeated string attributes = 3; + repeated string params = 4; + repeated uint32 dims = 5; + repeated string initializers = 6; + optional string entry = 7; + optional int32 trainer_num = 8; + optional bool sync = 9; + optional uint32 table_num = 10; + optional uint32 table_dim = 11; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; +} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 
f198919b0c87bb4f2ea9991e401a8242676d3f46..3d8a5ab21f00fcc4137d177b741023a827e325d7 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT WIN32) endif() if(WITH_CNCL) cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + cc_library(reducer SRCS reducer.cc DEPS layer) endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) @@ -41,13 +42,17 @@ if(NOT WIN32) endif(NOT WIN32) if(WITH_GLOO) cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) )) + if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) cc_library(reducer SRCS reducer.cc DEPS layer) endif() endif() +if(WITH_MLU) + SET(MLU_DEPS mlu_baseop) +endif() + if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) else() cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 8373c7fe50d0222d6b38a400e82239dc8c3590ad..7416d206fc43eaf5a56c3eb606bb0672d1172c0b 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -317,6 +317,7 @@ static std::shared_ptr> CallGradientHooks( auto tmp_var = var; for (const auto& hook_pair : var->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); + CheckVar(var, tmp_var); } (*tmp_ins_ptr)[pair.first][i] = tmp_var; } diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b0046915c4a52087ed792925b0b0ed200..fbc47f81fd33169f54aeb2c251f9b6c90cb44637 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 0abc5ad90e2697eb78ff1e21ceb2bc0e97e14a44..12aa13bbacc3bae5d690323f45817f95762c376c 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -732,6 +732,7 @@ void GradientAccumulator::CallGradientHooks() { << var_->GetVariableWrapperHooks().size(); for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); + CheckVar(inner_var_, tmp_var); } inner_var_ = 
tmp_var; } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e74711c2a796576d55e06cdfb59efa074324a71f..03f6775defc2f8fccba0654ae5d366d66ad88fc0 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -179,5 +179,29 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, template void TensorAdd(const VarType& src, VarType* dst); +inline void CheckVar(const std::shared_ptr& pre, + const std::shared_ptr& post) { + if (pre->IsEmpty() && !post->IsEmpty()) { + PADDLE_THROW(platform::errors::PermissionDenied( + "The tensor(%s) in before and after hook are not consistent", + pre->Name())); + } + if (!pre->IsEmpty() && !post->IsEmpty()) { + VLOG(4) << pre->DataType() << " " << post->DataType(); + PADDLE_ENFORCE_EQ( + pre->DataType(), post->DataType(), + platform::errors::PermissionDenied( + "The dtype of tensor(%s) before(%s) and after(%s) hook are not " + "consistent", + pre->Name(), framework::DataTypeToString(pre->DataType()), + framework::DataTypeToString(post->DataType()))); + PADDLE_ENFORCE_EQ(pre->Place(), post->Place(), + platform::errors::PermissionDenied( + "The place of tensor(%s) before(%s) and after(%s) " + "hook are not consistent", + pre->Name(), pre->Place(), post->Place())); + } +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 9dd1dacc02c25474803ef3177d9cd967ee681714..bae49fb381a475dd8227d1dc855a6db28c9cd273 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -186,11 +186,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; - if (platform::is_cpu_place(expected_kernel_key.place_)) { - auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, cpu_ctx); + if (expected_kernel_key.place_ != place) { + dev_ctx = pool.Get(expected_kernel_key.place_); } + // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); @@ -248,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_XPU_KP + expected_kernel_key.place_ = platform::XPUPlace(); bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8e1e2fbe9a12da672a633075ed4c41d3d62cd7e1..d7c0c8cc547e6b04f67ddbb06121d139756d5142 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -314,27 +314,25 @@ void BuildDygraphPhiKernelContext( phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]->MutableVar(); - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + 
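(Editor's note, not part of the patch.) The CheckVar helper above enforces that a user gradient hook does not turn an empty gradient into a non-empty one and does not silently change the tensor's dtype or place. A standalone sketch of the same before/after validation with toy metadata and exceptions (CheckHookResult, VarMeta, and the enums are illustrative names):

#include <stdexcept>
#include <string>

enum class DataType { kFP32, kFP16 };
enum class Place { kCPU, kGPU };

struct VarMeta {
  std::string name;
  bool empty = true;
  DataType dtype = DataType::kFP32;
  Place place = Place::kCPU;
};

// Throws if the hook changed properties that the engine relies on.
void CheckHookResult(const VarMeta& before, const VarMeta& after) {
  if (before.empty && !after.empty) {
    throw std::runtime_error("hook made tensor " + before.name + " non-empty");
  }
  if (!before.empty && !after.empty) {
    if (before.dtype != after.dtype) {
      throw std::runtime_error("hook changed dtype of " + before.name);
    }
    if (before.place != after.place) {
      throw std::runtime_error("hook changed place of " + before.name);
    }
  }
}

int main() {
  VarMeta before{"x@GRAD", /*empty=*/false, DataType::kFP32, Place::kGPU};
  VarMeta after = before;
  CheckHookResult(before, after);  // ok: the hook changed nothing
  after.dtype = DataType::kFP16;
  try {
    CheckHookResult(before, after);  // throws: dtype changed by the hook
  } catch (const std::runtime_error&) {
    // expected
  }
  return 0;
}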
PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -412,6 +410,60 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -435,7 +487,11 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 3a6365b2af21ae9012fe37293699caed9bb23855..fec9afbf3b403ca2fd45633326c7f7dec46e1243 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -31,7 +31,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #ifdef PADDLE_WITH_XPU_BKCL // TODO(liuyuhui) support xpu about div nranks in the future #endif + } else if (platform::is_mlu_place(tensor->place())) { + // TODO(zhangna) + VLOG(4) << "divnrank for mlu not support yet"; } } @@ -222,6 +225,56 @@ void SplitTensorsWithType( } #endif +#ifdef PADDLE_WITH_CNCL +// context is used to select the stream for concat +template <> +void ConcatTensorsWithType( + const platform::MLUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +// context is used to select the stream for split +template <> +void SplitTensorsWithType( + const platform::MLUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat npu grads since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + ConcatTensorsWithType( + static_cast(context), + dense_tensors_, &dense_contents_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat mlu grads since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { ConcatTensorsWithType( @@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) { 
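(Editor's note, not part of the patch.) The new MLU specializations above follow the same shape as the existing CPU/GPU ones: a switch over the runtime dtype forwards to a worker templated on the element type (FP16 or FP32). A compact standalone sketch of that dispatch idiom, with a toy element count instead of real tensors (ConcatForAllReduce, Float16, and VarDType are illustrative stand-ins):

#include <cstddef>
#include <cstdio>
#include <stdexcept>

enum class VarDType { kFP16, kFP32 };

struct Float16 { unsigned short bits; };  // stand-in for platform::float16

// Templated worker; in the real pass this role is played by the
// device/type-templated concat-for-allreduce helper.
template <typename T>
void ConcatForAllReduce(std::size_t element_count) {
  std::printf("concat %zu elements of %zu-byte type\n", element_count,
              sizeof(T));
}

// Runtime switch over dtype, forwarding to the right instantiation.
void ConcatTensorsWithType(VarDType dtype, std::size_t element_count) {
  switch (dtype) {
    case VarDType::kFP16:
      ConcatForAllReduce<Float16>(element_count);
      break;
    case VarDType::kFP32:
      ConcatForAllReduce<float>(element_count);
      break;
    default:
      throw std::invalid_argument("dtype not supported for allreduce concat");
  }
}

int main() {
  ConcatTensorsWithType(VarDType::kFP32, 1024);
  ConcatTensorsWithType(VarDType::kFP16, 1024);
  return 0;
}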
PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split npu grad since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + SplitTensorsWithType( + static_cast(context), + &dense_contents_, &dense_tensors_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split mlu grad since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { SplitTensorsWithType( @@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui) support XPU set constant VLOG(3) << "XPU doesn't support set_constant"; } +#elif defined(PADDLE_WITH_CNCL) + if (platform::is_mlu_place(group_tensor.place())) { + // TODO(liuyuhui) support MLU set constant + VLOG(3) << "MLU doesn't support set_constant"; + } #else auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); if (HasGrad(var_index)) { @@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) { cv_.notify_all(); } }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or GLOO.")); + "Not compiled with BKCL or NCCL or CNCL or GLOO.")); #endif } } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index cca773b840c279f05cd6bcd0ed82fda7fdd55a25..9fac4b41cbde01f365dcc603844b06c473a58843 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -45,7 +45,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e4f1cfdb3baeed9b5945b7843b6593528df48c29..09de0106ed6190c5f627ba9fb7cc038593b5088a 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 6c304278d21fde7af093b25cdd8f62a1d4528d31..5e674af1a08a87c11bfab1080be42e623661b38e 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -72,8 +72,10 @@ 
void GroupConcatSplit(Place place, size_t size) { value.push_back(static_cast(1.0 * j)); } - if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (std::is_same::value || + std::is_same::value) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_CNCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) { } #endif +#if defined(PADDLE_WITH_CNCL) +TEST(TestGroup, TestMLUConcatSplit) { + platform::MLUPlace mlu_place(0); + platform::CPUPlace cpu_place; + + int size = 3; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); + + size = 15; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); +} +#endif } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3ac2028790608529e0745dde2ce41ed57748f46d..02a1689c23a3fe5e1543a2e52d7661d5997bc062 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -24,6 +24,10 @@ #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3df6b9283565b5681c36f7197ae8..4cda3f32fdf3fdd2d14b201fa902c1f50f3ff98d 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -24,6 +24,13 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(relu, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -226,7 +233,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index d05036f7a12ebdc3db5fbfda5eb50c295c0478e4..2e38bd77cf63cc85b75a50e62250a6e746f525bc 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -28,6 +28,13 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -591,5 +598,5 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); 
USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 85bcbd1458f24a592b646dfcda750f37f113f73f..01c9d2847e0c850fd4159613a47d647bdbf46c31 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -253,7 +255,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CustomDevice if use " @@ -382,5 +384,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + auto dygraph_exe_ctx = + imperative::DygraphExecutionContext<imperative::VarBase>( + *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + default_attrs); + auto* opbase_with_kernel = + dynamic_cast<framework::OperatorWithKernel*>(op.get()); + PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + platform::errors::InvalidArgument( + "This op type:`%s` is not a OperatorWithKernel, only " + "OperatorWithKernel can get KernelSignature", + type)); + return phi::KernelSignature( + std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx))); + } + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 73ecbbe6143ca8e68049c2d2886e9eee93b741f1..fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace imperative { @@ -154,6 +155,10 @@ class Tracer { } } + phi::KernelSignature GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 26b8b9e8e17e046964d648f564c26293036e4033..5d0c3c98d2f618eb1f3d41e6a4e2434e5cd80401 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -45,6 +45,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) + +if(WITH_ONNXRUNTIME) + set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor) +endif() + #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) @@ -91,6 +96,13 @@ if (WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) endif () +if (WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc + ) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor) +endif (WITH_ONNXRUNTIME) + # Create shared inference library cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${SHARED_INFERENCE_DEPS}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 87efe5ec5190372b48f1bd6387e1c92f456865a1..bdc16ef4c7907764473c552461cde35f011ad489 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) @@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +if (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) + cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor) +else (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +endif (WITH_ONNXRUNTIME) + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -75,6 +82,16 @@ elseif (WIN32) ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if (WITH_ONNXRUNTIME) + if (NOT APPLE AND NOT WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + elseif (WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps} + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND WITH_MKLDNN) if (NOT APPLE AND NOT WIN32) cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index fd2ccffae3b4af3280f622722d6080d7c68bfbad..41c01d3b7e261314d8dc6b852f5b2a597421fe48 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -168,6 +168,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, Update(); } +void AnalysisConfig::EnableONNXRuntime() { +#ifdef PADDLE_WITH_ONNXRUNTIME + use_onnxruntime_ = true; +#else + 
LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()"; + use_onnxruntime_ = false; +#endif + + Update(); +} + +void AnalysisConfig::DisableONNXRuntime() { + use_onnxruntime_ = false; + Update(); +} + +void AnalysisConfig::EnableORTOptimization() { +#ifdef PADDLE_WITH_ONNXRUNTIME + enable_ort_optimization_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()"; + enable_ort_optimization_ = false; +#endif + + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -274,6 +301,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + // fleet exe related + CP_MEMBER(dist_config_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd6e3a3c759c05bda34978dd78d07358aacd53fe..871ed596a3ee9d6362b03e99ca10313765826a51 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/version.h" @@ -47,6 +48,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/string/split.h" + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -56,6 +65,10 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #endif +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" @@ -71,6 +84,8 @@ using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; #endif +int AnalysisPredictor::clone_num_ = 1; + namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && @@ -186,14 +201,14 @@ bool AnalysisPredictor::Init( return false; } + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + // Prepare executor, create local variables. 
if (!PrepareExecutor()) { return true; } - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; } @@ -359,6 +374,13 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; + return PrepareFleetExecutor(); + } +#endif DisablePrepareDataOpt(inference_program_, 0, false); executor_->Prepare(sub_scope_, *inference_program_, 0, @@ -371,6 +393,226 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +bool AnalysisPredictor::PrepareFleetExecutor() { + VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; + if (config_.dist_config().nranks() > 1 && !CommInit()) { + return false; + } + task_node_.reset(new distributed::TaskNode(inference_program_.get(), + config_.dist_config().rank())); + // With auto cut, there is no concept of pp, no need to add dependency. + task_node_->SetType("Compute"); + task_node_->Init(config_.use_feed_fetch_ops_enabled()); + executor_desc_ = distributed::FleetExecutorDesc(); + executor_desc_.set_cur_rank(config_.dist_config().rank()); + std::unordered_map id_to_rank; + for (int i = 0; i < config_.dist_config().nranks(); ++i) { + distributed::RankInfo *rank_info = executor_desc_.add_cluster_info(); + rank_info->set_rank(i); + rank_info->set_ip_port(config_.dist_config().trainer_endpoints()[i]); + id_to_rank.insert({i, i}); + } + fleet_exe_.reset(new distributed::FleetExecutor(executor_desc_)); + // NOTE: Vars of feed fetch ops are not persistable, + // which will result in that those vars will be created in + // the subscope (microscope) in fleet executor. This will + // cause that the GetInputTensor/GetOutputTensor funct + // in analysis predictor cannot find those vars in the scope + // returned by the DistModel, since DistModel only return the + // root scope. 
So, those vars must to be created in the root + // scope instead of in the microscope + std::vector feed_fetch_vars; + for (auto pair : idx2feeds_) { + feed_fetch_vars.emplace_back(pair.second); + } + for (auto pair : idx2fetches_) { + feed_fetch_vars.emplace_back(pair.second); + } + fleet_exe_->Init(config_.dist_config().carrier_id(), + *(inference_program_.get()), scope_.get(), place_, 1, + {task_node_.get()}, id_to_rank, feed_fetch_vars); + return true; +} + +bool AnalysisPredictor::CommInit() { + std::map> ring_id_to_ranks{}; + std::map> rank_to_ring_ids{}; + if (!LoadConverterConfig(&ring_id_to_ranks, &rank_to_ring_ids)) { + VLOG(3) << "Load converter config failed, DistModel init failed."; + return false; + } + std::unique_ptr comm_init_program( + new framework::ProgramDesc()); + framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); + std::vector &ring_ids = + rank_to_ring_ids[config_.dist_config().rank()]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = ring_id_to_ranks[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = ring_id_to_ranks[ring_id]; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + break; + } + rank_in_group += 1; + } + std::vector peer_endpoints; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + continue; + } + peer_endpoints.emplace_back( + config_.dist_config().trainer_endpoints()[rank]); + } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; + } + framework::NaiveExecutor e(place_); + e.CreateVariables(*comm_init_program, 0, true, scope_.get()); + e.Prepare(scope_.get(), *comm_init_program, 0, false); + e.Run(); + VLOG(3) << "Comm init successful."; + return true; +} + +void AnalysisPredictor::InsertCommOp( + std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, framework::BlockDesc *block, + int ring_id) { + /* + * tmp_var_name: the var name for var comm_id + * nranks: number of total ranks + * rank: the rank of local rank in the comm group + * peer_endpoints: peer's endpoints + * block: the block where to insert the comm ops + * ring_id: the ring_id to be inited + */ + const std::string &endpoint = config_.dist_config().current_endpoint(); + std::stringstream ss; + ss << "Init comm with tmp var: " << tmp_var_name + << ". The ring id is: " << ring_id << ". The group has: " << nranks + << " ranks. Current rank in the group is: " << rank + << ". The endpoint is: " << endpoint << ". 
Peer endpoints are: "; + for (auto ep : peer_endpoints) { + ss << ep << ", "; + } + VLOG(3) << ss.str(); + if (config_.use_gpu()) { + framework::VarDesc *new_var = block->Var(tmp_var_name); + new_var->SetType(framework::proto::VarType::RAW); + new_var->SetPersistable(true); + framework::OpDesc *gen_nccl_id_op = block->AppendOp(); + gen_nccl_id_op->SetType("c_gen_nccl_id"); + gen_nccl_id_op->SetOutput("Out", {tmp_var_name}); + gen_nccl_id_op->SetAttr("rank", rank); + gen_nccl_id_op->SetAttr("endpoint", + config_.dist_config().current_endpoint()); + gen_nccl_id_op->SetAttr("other_endpoints", peer_endpoints); + gen_nccl_id_op->SetAttr("ring_id", ring_id); + gen_nccl_id_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + gen_nccl_id_op->CheckAttrs(); + framework::OpDesc *comm_init_op = block->AppendOp(); + comm_init_op->SetType("c_comm_init"); + comm_init_op->SetInput("X", {tmp_var_name}); + comm_init_op->SetAttr("rank", rank); + comm_init_op->SetAttr("nranks", nranks); + comm_init_op->SetAttr("ring_id", ring_id); + comm_init_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + comm_init_op->CheckAttrs(); + } else { + LOG(WARNING) << "DistModelInf doesn't init comm."; + // TODO(fleet exe dev): comm init for more devices + } +} + +bool AnalysisPredictor::LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids) { + VLOG(3) << "Going to load converter config from: " + << config_.dist_config().comm_init_config() << "\n"; + std::ifstream fin(config_.dist_config().comm_init_config(), std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_.dist_config().comm_init_config())); + std::string line; + bool ring_to_rank{true}; + // Reading config from file, the config file should like these format + // [ring_id -> ranks] + // 0,0,1,2,3 + // 1,0,1 + // 2,2,3 + // 21,0,1 + // 22,1,2 + // 23,2,3 + // [rank -> ring_ids] + // 0,0,1,21 + // 1,0,1,21,22 + // 2,0,2,22,23 + // 3,0,2,23 + while (std::getline(fin, line)) { + std::vector one_line = paddle::string::Split(line, ','); + if (one_line.size() == 1) { + // start a new section of the config + if (line == "[ring_id -> ranks]") { + ring_to_rank = true; + } else if (line == "[rank -> ring_ids]") { + ring_to_rank = false; + } + } else { + // parse key - values pairs in one section + int64_t key = std::stoll(one_line[0]); + for (size_t i = 1; i < one_line.size(); ++i) { + int64_t val = std::stoll(one_line[i]); + if (ring_to_rank) { + if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { + ring_id_to_ranks->insert({key, std::vector()}); + } + ring_id_to_ranks->at(key).emplace_back(val); + } else { + if (rank_to_ring_ids->find(key) == rank_to_ring_ids->end()) { + rank_to_ring_ids->insert({key, std::vector()}); + } + rank_to_ring_ids->at(key).emplace_back(val); + } + // NOTE: add more configuration sections here + } + } + } + std::stringstream ss; + ss << "Loaded the following converter config:\n"; + ss << "ring_id_to_ranks:\n"; + for (auto pair : *ring_id_to_ranks) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + ss << "rank_to_ring_ids:\n"; + for (auto pair : *rank_to_ring_ids) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + VLOG(3) << ss.str(); + return true; +} +#endif + void 
AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN std::vector> inputs_shape; @@ -946,13 +1188,24 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "The variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -985,13 +1238,24 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "he variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1023,6 +1287,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "ZeroCopyRun will use the fleet executor."; + inference::Timer timer; + timer.tic(); + fleet_exe_->Run(config_.dist_config().carrier_id()); + VLOG(3) << "Fleet executor inf runs once use: " + << std::to_string(timer.toc()) << "ms"; + return true; + } +#endif paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1035,7 +1311,6 @@ bool AnalysisPredictor::ZeroCopyRun() { MkldnnPreSet(shape_vector); } #endif - executor_->Run(); if (config_.shape_range_info_collected()) { @@ -1364,7 +1639,7 @@ std::unique_ptr AnalysisPredictor::Clone() { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); - x->executor_->ResetTrtOps(++x->clone_num_); + x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_); return std::unique_ptr(x); } @@ -1491,6 +1766,27 @@ namespace paddle_infer { Predictor::Predictor(const Config &config) { const_cast(&config)->SwitchUseFeedFetchOps(false); // The second parameter indicates that the discard log is not printed + if (config.use_onnxruntime()) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (config.use_gpu()) { + LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU," + "and it falls back to use Paddle Inference."; + } else 
if (!paddle::CheckConvertToONNX(config)) { + LOG(WARNING) + << "Paddle2ONNX does not support converting this model; falling back to " + "Paddle Inference."; + } else { + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kONNXRuntime>(config); + return; + } +#else + LOG(WARNING) + << "The ONNXRuntime backend is not enabled; please re-compile Paddle with " + "the WITH_ONNXRUNTIME option. Falling back to Paddle Inference."; +#endif + } predictor_ = paddle::CreatePaddlePredictor< Config, paddle::PaddleEngineKind::kAnalysis>(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8e56101d37dabe8837b8adde9672ce45ffd62a0..21a7e9658bbeeb16d4cbff6364aaef68edcae16d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,10 @@ #include #include #include +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#endif #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -391,6 +395,53 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet exe related + + /// + /// \brief prepare for fleet executor to run + /// + /// Used in AnalysisPredictor::Init(), + /// + bool PrepareFleetExecutor(); + + /// + /// \brief init NCCL env for multi-GPU inference + /// + /// Used in AnalysisPredictor::PrepareFleetExecutor() + /// + bool CommInit(); + + /// + /// \brief read the config to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] ring_id_to_ranks: a ptr to ring_id_to_ranks + /// \param[in] rank_to_ring_ids: a ptr to rank_to_ring_ids + /// + bool LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids); + + /// + /// \brief add ops and run them with NaiveExecutor to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] tmp_var_name: var name to hold NCCL unique id + /// \param[in] nranks: number of ranks in one comm group + /// \param[in] rank: relative rank of current rank in the comm group + /// \param[in] peer_endpoints: group's peers' endpoints + /// \param[in] block: the block to insert comm ops + /// \param[in] ring_id: the ring id to be used to init NCCL env + /// + void InsertCommOp(std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, + framework::BlockDesc *block, int ring_id); +#endif + private: AnalysisConfig config_; Argument argument_; @@ -435,7 +486,15 @@ class AnalysisPredictor : public PaddlePredictor { bool status_is_cloned_{false}; std::map>> shape_info_; - int clone_num_{1}; + static int clone_num_; + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet executor related + distributed::FleetExecutorDesc executor_desc_; + std::shared_ptr fleet_exe_; + std::shared_ptr task_node_; +#endif }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a15a1cd84b14094c6ea95f94ffaaf31f4a790376..2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d 100644 ---
a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/analysis_predictor.h" +#if defined(PADDLE_WITH_CUDA) +#include +#endif #include #include #include // NOLINT @@ -354,6 +357,24 @@ TEST(AnalysisPredictor, set_xpu_device_id) { } #endif +TEST(AnalysisPredictor, enable_onnxruntime) { + AnalysisConfig config; + config.EnableONNXRuntime(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.use_onnxruntime()); +#else + ASSERT_TRUE(!config.use_onnxruntime()); +#endif + config.EnableORTOptimization(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.ort_optimization_enabled()); +#else + ASSERT_TRUE(!config.ort_optimization_enabled()); +#endif + config.DisableONNXRuntime(); + ASSERT_TRUE(!config.use_onnxruntime()); +} + } // namespace paddle namespace paddle_infer { @@ -405,4 +426,91 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Predictor, EnableONNXRuntime) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + auto predictor = CreatePredictor(config); +} + +TEST(Tensor, CpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + w0->ShareExternalData(input_data[0].data(), {4, 1}, PlaceType::kCPU); + w1->ShareExternalData(input_data[1].data(), {4, 1}, PlaceType::kCPU); + w2->ShareExternalData(input_data[2].data(), {4, 1}, PlaceType::kCPU); + w3->ShareExternalData(input_data[3].data(), {4, 1}, PlaceType::kCPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + std::vector out_data; + out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies())); + out->ShareExternalData(out_data.data(), out_shape, PlaceType::kCPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} + +#if defined(PADDLE_WITH_CUDA) +TEST(Tensor, GpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + std::vector input_gpu(4, nullptr); + + for (size_t i = 0; i < 4; ++i) { + cudaMalloc(reinterpret_cast(&input_gpu[i]), 4 * sizeof(int64_t)); + cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t), + cudaMemcpyHostToDevice); + } + + w0->ShareExternalData(input_gpu[0], {4, 1}, PlaceType::kGPU); + w1->ShareExternalData(input_gpu[1], {4, 1}, PlaceType::kGPU); + w2->ShareExternalData(input_gpu[2], {4, 1}, PlaceType::kGPU); + w3->ShareExternalData(input_gpu[3], {4, 1}, PlaceType::kGPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + float* out_data; + auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies()) * + sizeof(float); + cudaMalloc(reinterpret_cast(out_data), out_size * 
sizeof(float)); + out->ShareExternalData(out_data, out_shape, PlaceType::kGPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d03840ada36bce8cfdc2213284697e6d873cbde0..df98a7b05cf3f2035e9a21ec10e4b44eca843bbd 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -151,6 +159,17 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -213,6 +232,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef5c08cd041eb7af4c7f17a95c4fd9b8601e4bad --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a demo of MobileNet inference using the ONNXRuntime backend. + */ + +#include // use glog instead of CHECK to avoid importing other paddle header files. +#include +#include "gflags/gflags.h" +#include "utils.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle { +namespace demo { + +/* + * Use the ONNXRuntime engine to run inference on the demo model. + */ +void Main() { + paddle::AnalysisConfig config; + config.EnableONNXRuntime(); + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + auto predictor = paddle_infer::CreatePredictor(config); + + // Inference. + std::vector input_shape = {1, 3, 224, 224}; + std::vector input_data(1 * 3 * 224 * 224, 1.0); + std::vector out_data; + out_data.resize(1000); + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto input_tensor = predictor->GetInputHandle(input_names[0]); + input_tensor->Reshape(input_shape); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + + input_tensor->CopyFromCpu(input_data.data()); + predictor->Run(); + output_tensor->CopyToCpu(out_data.data()); + + VLOG(3) << "output.size " << out_data.size(); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 5f062e8063253a08466b2491e80417af07047394..79a31555c7f0b1cb4a8d9c48bae16145d605935b 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset USE_TENSORRT=$5 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr -MSVC_STATIC_CRT=$7 +WITH_ONNXRUNTIME=$7 +MSVC_STATIC_CRT=$8 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -38,6 +39,26 @@ else use_gpu_list='false' fi +mkdir -p $DATA_DIR +cd $DATA_DIR + +if [ $7 == ON ]; then + ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB} + PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB} + #download model + mkdir -p MobileNetV2 + cd MobileNetV2 + if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then + echo "MobileNetV2.inference.model.tar.gz has been downloaded." + else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + tar xzf *.tar.gz + fi + cd .. +fi + PREFIX=inference-vis-demos%2F URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} @@ -58,8 +79,7 @@ function download() { fi cd ..
} -mkdir -p $DATA_DIR -cd $DATA_DIR + vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name @@ -93,7 +113,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ @@ -112,7 +133,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -138,7 +160,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/trt_mobilenet_demo.exe \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -156,7 +179,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then @@ -176,7 +200,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -200,7 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -211,6 +237,26 @@ for WITH_STATIC_LIB in ON OFF; do exit 1 fi fi + + # --------onnxruntime mobilenetv2 on linux/mac------ + if [ $WITH_ONNXRUNTIME == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=onnxruntime_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME + make -j$(nproc) + ./onnxruntime_mobilenet_demo \ + --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 + if [ $? -ne 0 ]; then + echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." 
+ exit 1 + fi + fi fi done set +x diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 1d09b01f8f852f2bb7f668d0e2b4ee3250c9cc64..18b1d09f0e8a7c4be9862991060a4706ee7cde7e 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/allocator.h" namespace paddle_infer { @@ -205,6 +206,73 @@ void Tensor::CopyFromCpu(const T *data) { } } +template +struct DataTypeInfo; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT32; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT16; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT64; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::UINT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT32; +}; + +paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { + PADDLE_ENFORCE_EQ( + layout, DataLayout::kNCHW, + paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); + return paddle::experimental::DataLayout::NCHW; +} + +template +void Tensor::ShareExternalData(const T *data, const std::vector &shape, + PlaceType place, DataLayout layout) { + EAGER_GET_TENSOR(paddle::framework::LoDTensor) + size_t size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + phi::DenseTensorMeta meta(DataTypeInfo().TYPE, phi::make_ddim(shape), + LayoutConvert(layout)); + if (place == PlaceType::kCPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CPUPlace()), + meta); + *tensor = std::move(dtensor); + } else if (place == PlaceType::kGPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CUDAPlace(device_)), + meta); + *tensor = std::move(dtensor); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "PlaceType must be PlaceType::kCPU or PlaceType::kGPU.")); + } +} + void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { EAGER_GET_TENSOR(paddle_infer::Strings); PADDLE_ENFORCE_GE(tensor->size(), 0, @@ -334,6 +402,25 @@ template PD_INFER_DECL void Tensor::CopyFromCpu(const uint8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const int8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const float16 *data); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int64_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int32_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const uint8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( 
+ const int8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float16 *data, const std::vector &shape, PlaceType place, + DataLayout layout); + template PD_INFER_DECL void Tensor::CopyToCpu(float *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int64_t *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int32_t *data) const; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee82da139d8f39c26002763c4a4835050c48fc99 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid//platform/device/gpu/gpu_types.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { + +framework::proto::VarType::Type ConvertONNXType( + ONNXTensorElementDataType type) { + switch (type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return framework::proto::VarType::FP32; + // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + // return DataType::FP16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return framework::proto::VarType::INT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return framework::proto::VarType::INT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return framework::proto::VarType::INT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + return framework::proto::VarType::UINT8; + default: + LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); + return framework::proto::VarType::FP32; + } +} + +bool CheckConvertToONNX(const AnalysisConfig &config) { + if (!config.model_dir().empty()) { + LOG(ERROR) << "Paddle2ONNX not support model_dir config"; + // TODO(heliqi jiangjiajun): Paddle2ONNX not support + // config.model_dir() + "/__model__" + // config.model_dir() + var_name + return false; + } 
else if (config.prog_file().empty() || config.params_file().empty()) { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s' or params path '%s'.", + config.model_dir(), config.prog_file(), config.params_file()); + return false; + } + return paddle2onnx::IsExportable(config.prog_file(), config.params_file(), + config.model_from_memory()); +} + +bool ONNXRuntimePredictor::Init() { + VLOG(3) << "ONNXRuntime Predictor::init()"; + + // Now ONNXRuntime only suuport CPU + if (config_.use_gpu()) { + place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); + } else { + place_ = paddle::platform::CPUPlace(); + } + scope_.reset(new paddle::framework::Scope()); + sub_scope_ = &scope_->NewScope(); + + std::string onnx_proto; + paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, + config_.model_from_memory()); + + Ort::SessionOptions session_options; + if (config_.ort_optimization_enabled()) { + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + } + // Turn optimization off first, and then turn it on when it's stable + // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + // session_options.EnableCpuMemArena(); + // session_options.EnableMemPattern(); + // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); + session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); + VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); + if (config_.profile_enabled()) { + LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " + "performance"; +#if defined(_WIN32) + session_options.EnableProfiling(L"ONNX"); +#else + session_options.EnableProfiling("ONNX"); +#endif + } else { + VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " + "will be " + "generated."; + } + session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + auto *ptr = scope_->Var(output_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(output_name); + } + + return true; +} + +template <> +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } + + 
PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); + + VLOG(3) << "create ONNXRuntimePredictor"; + + std::unique_ptr predictor(new ONNXRuntimePredictor(config)); + // Each config can only be used for one predictor. + config.SetInValid(); + auto predictor_p = dynamic_cast(predictor.get()); + + if (!predictor_p->Init()) { + return nullptr; + } + + return predictor; +} + +std::vector ONNXRuntimePredictor::GetInputNames() { + std::vector input_names; + for (auto input_desc : input_desc_) { + input_names.push_back(input_desc.name); + } + return input_names; +} + +std::map> +ONNXRuntimePredictor::GetInputTensorShape() { + std::map> input_shapes; + for (auto input_desc : input_desc_) { + input_shapes[input_desc.name] = input_desc.shape; + } + return input_shapes; +} + +std::vector ONNXRuntimePredictor::GetOutputNames() { + std::vector output_names; + for (auto output_desc : output_desc_) { + output_names.push_back(output_desc.name); + } + return output_names; +} + +std::unique_ptr ONNXRuntimePredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); + res->input_or_output_ = true; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The out variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); + res->input_or_output_ = false; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast(tensor->data()), size, + shape.data(), shape.size(), desc.dtype); +} + +void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, + const ONNXDesc &desc) { + auto info = value.GetTensorTypeAndShapeInfo(); + + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + tensor->Resize(phi::make_ddim(info.GetShape())); + auto dtype = ConvertONNXType(info.GetElementType()); + auto *ptr = tensor->mutable_data(place_, dtype); + + if (platform::is_cpu_place(place_)) { + std::memcpy(ptr, const_cast(value.GetTensorData()), + tensor->numel() * framework::SizeOfType(dtype)); + } else { + auto src_place = place_; + auto dst_place = place_; + memory::Copy(dst_place, ptr, src_place, + const_cast(value.GetTensorData()), + tensor->numel() * framework::SizeOfType(dtype)); + } +} + 
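+ // Note on data flow between Paddle and ONNXRuntime in this predictor: + // GetOrtValue() wraps the existing Paddle tensor buffer in an Ort::Value via + // Ort::Value::CreateTensor, so binding an input does not copy data, while + // AsTensor() resizes the Paddle tensor to the ONNX output shape and copies the + // output buffer back with std::memcpy / memory::Copy.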
+bool ONNXRuntimePredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + LOG(ERROR) << "Not support Run"; + return false; +} + +bool ONNXRuntimePredictor::ZeroCopyRun() { + try { + Ort::IoBinding binding(session_); + std::vector inputs; + std::vector outputs; + Ort::RunOptions options; + + inputs.reserve(input_desc_.size()); + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding.BindInput(desc.name.c_str(), inputs.back()); + } + + // TODO(heliqi): Optimization —— move to Init() + for (auto desc : output_desc_) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding.BindOutput(desc.name.c_str(), memory_info); + } + + session_.Run({}, binding); + + outputs = binding.GetOutputValues(); + for (size_t i = 0; i < output_desc_.size(); ++i) { + AsTensor(outputs[i], output_desc_[i]); + } + } catch (const std::exception &e) { + LOG(ERROR) << e.what(); + return false; + } + + return true; +} + +std::unique_ptr ONNXRuntimePredictor::Clone() { + LOG(ERROR) << "Not support Clone(), Please create new Predictor"; + return nullptr; +} + +uint64_t ONNXRuntimePredictor::TryShrinkMemory() { + return paddle::memory::Release(place_); +} + +ONNXRuntimePredictor::~ONNXRuntimePredictor() { + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } + memory::Release(place_); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..7fb07aa97bd2746773192456ddeba941a24e8906 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -0,0 +1,225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_compatible_info.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/fluid/string/printf.h" + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#include "paddle2onnx/converter.h" + +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +/// +/// \file onnxruntime_predictor.h +/// +/// \brief A predictor using ONNXRuntime +/// +/// \author heliqi@baidu.com +/// \date 2022-02-14 +/// \since 2.3.0 +/// + +namespace paddle { + +bool CheckConvertToONNX(const AnalysisConfig &config); + +struct ONNXDesc { + std::string name; + std::vector shape; + ONNXTensorElementDataType dtype; +}; + +/// +/// \class ONNXRuntimePredictor +/// +/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference +/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePaddlePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output tensors +/// \code{cpp} +/// auto input_t = predictor->GetInputTensor(input_names[0]); +/// auto output_t = predictor->GetOutputTensor(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->ZeroCopyRun(); +/// \endcode +/// +class ONNXRuntimePredictor : public PaddlePredictor { + public: + /// + /// \brief Construct a new ONNXRuntime Predictor object + /// + /// \param[in] AnalysisConfig config + /// + explicit ONNXRuntimePredictor(const AnalysisConfig &config) + : config_(config) { + predictor_id_ = inference::GetUniqueId(); + env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); + } + /// + /// \brief Destroy the ONNXRuntime Predictor object + /// + ~ONNXRuntimePredictor(); + + /// + /// \brief Initialize predictor + /// + /// \return Whether the init function executed successfully + /// + bool Init(); + + /// + /// \brief Get the input names + /// + /// \return input names + /// + std::vector GetInputNames(); + + /// + /// \brief Get the output names + /// + /// \return output names + /// + std::vector GetOutputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// + std::unique_ptr GetInputTensor( + const std::string &name) override; + + /// + /// \brief Get the Output Tensor object + /// + /// \param[in] name output name + /// \return output tensor + /// + std::unique_ptr GetOutputTensor( + const std::string &name) override; + /// + /// \brief Get all input names and their corresponding shapes + /// + /// \return the map of input names and shapes + /// + std::map> GetInputTensorShape() override; + + /// Not supported + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// + bool ZeroCopyRun() override; + + /// + /// \brief Release all tmp tensor to compress the size of the memory pool.
+ /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. + /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + uint64_t TryShrinkMemory() override; + /// + /// \brief Clone to get a new predictor. Thread safe. + /// + /// \return a new predictor + /// + std::unique_ptr Clone() override; + + std::shared_ptr scope_; + + private: + /// + /// \brief Get the Ort::Value (input tensor). + /// + /// \param[in] desc an ONNXDesc (name, shape, dtype) + /// + /// \param[in] device_name device name, "Cpu" or "Cuda" + /// + /// \return an Ort::Value + /// + Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); + + /// + /// \brief Copy an Ort::Value (output tensor) back into the corresponding Paddle tensor. + /// + /// \param[in] value Ort::Value(output Tensor) + /// + /// \param[in] desc an ONNXDesc (name, shape, dtype) + /// + void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + + private: + AnalysisConfig config_; + + // ONNXRuntime + Ort::Env env_; + Ort::Session session_{nullptr}; + + platform::Place place_; + framework::Scope *sub_scope_{nullptr}; + std::vector input_desc_; + std::vector output_desc_; + int predictor_id_; + +// Some more detailed tests are made friends of the predictor, so that +// all the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on); +#endif +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2be2de9c60bb1c3fdedf13212d50a6f4e155d4df --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+DEFINE_string(dirname, "", "dirname to tests.");
+
+namespace paddle {
+
+TEST(ONNXRuntimePredictor, onnxruntime_on) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname + "/inference.pdmodel",
+                  FLAGS_dirname + "/inference.pdiparams");
+  config.EnableONNXRuntime();
+  config.EnableORTOptimization();
+  config.SetCpuMathLibraryNumThreads(2);
+  LOG(INFO) << config.Summary();
+
+  auto _predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
+          config);
+  ASSERT_TRUE(_predictor);
+  auto* predictor = static_cast<ONNXRuntimePredictor*>(_predictor.get());
+
+  ASSERT_TRUE(predictor);
+  ASSERT_TRUE(!predictor->Clone());
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // Dummy Input Data
+  std::vector<int64_t> input_shape = {-1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+
+  // testing all interfaces
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto get_input_shape = predictor->GetInputTensorShape();
+
+  ASSERT_EQ(input_names.size(), 1UL);
+  ASSERT_EQ(output_names.size(), 1UL);
+  ASSERT_EQ(input_names[0], "inputs");
+  ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1");
+  ASSERT_EQ(get_input_shape["inputs"], input_shape);
+
+  auto input_tensor = predictor->GetInputTensor(input_names[0]);
+  input_tensor->Reshape({1, 3, 224, 224});
+  auto output_tensor = predictor->GetOutputTensor(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+  output_tensor->CopyToCpu(out_data.data());
+
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 180c028c6a61088edeb8723891d4de1ba2272b80..7b765e3fa8a24ef1b81b68da8ba12dd8e5577572 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -76,6 +76,54 @@ struct LiteNNAdapterConfig {
   LiteNNAdapterConfig& Disable();
 };
 
+struct DistConfig {
+  bool use_dist_model() const { return use_dist_model_; }
+  void EnableDistModel(bool use_dist_model) {
+    use_dist_model_ = use_dist_model;
+  }
+
+  std::vector<std::string> trainer_endpoints() const {
+    return trainer_endpoints_;
+  }
+
+  std::string current_endpoint() const { return current_endpoint_; }
+
+  void SetEndpoints(const std::vector<std::string>& trainer_endpoints,
+                    const std::string& current_endpoint) {
+    trainer_endpoints_ = trainer_endpoints;
+    current_endpoint_ = current_endpoint;
+  }
+
+  int64_t nranks() const { return nranks_; }
+
+  int64_t rank() const { return rank_; }
+
+  void SetRanks(int64_t nranks, int64_t rank) {
+    nranks_ = nranks;
+    rank_ = rank;
+  }
+
+  std::string comm_init_config() const { return comm_init_config_; }
+
+  void SetCommInitConfig(const std::string& comm_init_config) {
+    comm_init_config_ = comm_init_config;
+  }
+ + void SetCarrierId(const std::string& carrier_id) { carrier_id_ = carrier_id; } + + std::string carrier_id() const { return carrier_id_; } + + protected: + // DistModel Inference related + bool use_dist_model_{false}; // whether use DistModel or not + std::vector trainer_endpoints_{}; // all trainers' endpoints + std::string current_endpoint_{}; // current trainer's endpoint + int64_t nranks_{1}; // total ranks (number of trainers) + int64_t rank_{0}; // rank + std::string comm_init_config_{}; // converter config path + std::string carrier_id_{"inference"}; +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -271,6 +319,18 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on ONNXRuntime. + /// + void EnableONNXRuntime(); + /// + /// \brief Turn off ONNXRuntime. + /// + void DisableONNXRuntime(); + /// + /// \brief Turn on ONNXRuntime Optimization. + /// + void EnableORTOptimization(); + /// /// \brief A boolean state telling whether the GPU is turned on. /// /// \return bool Whether the GPU is turned on. @@ -294,6 +354,19 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_ipu() const { return use_ipu_; } /// + /// \brief A boolean state telling whether the ONNXRuntime is turned on. + /// + /// \return bool Whether the ONNXRuntime is turned on. + /// + bool use_onnxruntime() const { return use_onnxruntime_; } + /// + /// \brief A boolean state telling whether the ONNXRuntime Optimization is + /// turned on. + /// + /// \return bool Whether the ONNXRuntime Optimization is turned on. + /// + bool ort_optimization_enabled() const { return enable_ort_optimization_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. @@ -763,6 +836,12 @@ struct PD_INFER_DECL AnalysisConfig { LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + void SetDistConfig(const DistConfig& dist_config) { + dist_config_ = dist_config; + } + + const DistConfig& dist_config() const { return dist_config_; } + protected: // Update the config. void Update(); @@ -787,6 +866,10 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // ONNXRuntime related + bool use_onnxruntime_{false}; + bool enable_ort_optimization_{false}; + // Padding related bool use_fc_padding_{true}; @@ -902,6 +985,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable bool is_valid_{true}; std::string opt_cache_dir_; friend class paddle_infer::experimental::InternalUtils; + + // fleet exe related + DistConfig dist_config_{}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c129efe494b4fb36bc72d3c93e24951ba7fef322..657dd9b600cce7173e3aa8d0156ba0975199cf98 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; + friend class ONNXRuntimePredictor; explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} }; @@ -381,6 +382,7 @@ enum class PaddleEngineKind { kNative = 0, ///< Use the native Fluid facility. kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. kAnalysis, ///< More optimization. 
+ kONNXRuntime, ///< Use ONNXRuntime }; template @@ -395,6 +397,11 @@ template <> PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +template <> +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); + PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype); PD_INFER_DECL std::string get_version(); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 313e1f2faea553809cb6fce66ca9a751bace8d75..f5f36d805b43ea0815683e3b65bf157fe5beb2de 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "conv_affine_channel_fuse_pass", // - "adaptive_pool2d_convert_global_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // + "adaptive_pool2d_convert_global_pass", + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -134,22 +132,20 @@ const std::vector kLiteSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute 
BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 81eecbb2c1480499b81556c48d021a8ff8929899..5a98d109aed79cc5bcefdc01b47a166bdf9c01d9 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -47,6 +47,8 @@ enum DataType { enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; + /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. /// Zero copy means that the tensor supports direct copy of host or device data @@ -92,6 +94,17 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Share the data with tensor data. + /// It's usually used to set the tensor data. + /// \param data The pointer of the data, from which the tensor will share. + /// \param shape The shape of data. + /// \param place The place of data. + /// \param layout The layout of data. Only NCHW is supported now. + template + void ShareExternalData(const T* data, const std::vector& shape, + PlaceType place, + DataLayout layout = DataLayout::kNCHW); + /// \brief Experimental interface. /// It's usually used to set the input tensor data with Strings data type. /// \param data The pointer of the data, from which the tensor will copy. diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index e342190fda1aca53a6814806e1afec1335224d79..d7b07652babbd1e24e2c650ac8ac079f03523d12 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { return config->use_gpu(); } +void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableONNXRuntime(); +} + +void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableONNXRuntime(); +} + +PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_onnxruntime(); +} + +void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableORTOptimization(); +} + void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, int32_t l3_workspace_size, PD_Bool locked, PD_Bool autotune, const char* autotune_file, diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index c314aca918f141d30661d9034656899bbb816063..f6b754cad213f8d5249317468b5ceb21e863f6ad 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( __pd_keep PD_Config* pd_config); /// +/// \brief Turn on ONNXRuntime. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off ONNXRuntime. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the ONNXRutnime is turned on. +/// +/// \return Whether the ONNXRuntime is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization( + __pd_keep PD_Config* pd_config); +/// /// \brief Turn on XPU. /// /// \param[in] pd_onfig config diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index def26913b0a1c082b3a983cea5fa8021c468b59c..8f9f34c06b4768317d6f710ac49a7610a9ef9d6a 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) { C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId)) } +/// +/// \brief Turn on ONNXRuntime. +/// +func (config *Config) EnableONNXRuntime() { + C.PD_ConfigEnableONNXRuntime(config.c) +} + +/// +/// \brief Turn off ONNXRuntime. +/// +func (config *Config) DisableONNXRuntime() { + C.PD_ConfigDisableONNXRuntime(config.c) +} + +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return bool Whether the ONNXRuntime is turned on. +/// +func (config *Config) ONNXRuntimeEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c)) +} + +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +func (config *Config) EnableORTOptimization() { + C.PD_ConfigEnableORTOptimization(config.c) +} + /// /// \brief Turn on XPU. 
/// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index b82161880839e500a20b787914e2827da151106b..297841dcbcf6c19aef4a536557ec30e76ea9f82c 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) { config.SetBfloat16Op([]string{"fc", "mul"}) } + +func TestONNXRuntime(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.DisableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.EnableORTOptimization() + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) +} \ No newline at end of file diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go index 40e518304510c57fec9cd7609ecbd6eefa456050..755558f96238d11842f8245c2b36210c60d8a057 100644 --- a/paddle/fluid/inference/goapi/predictor_test.go +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) { cloned.ClearIntermediateTensor() } +func TestONNXRuntimePredictor(t *testing.T) { + t.Logf("Version:\n%+v", Version()) + config := NewConfig() + config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams") + config.EnableONNXRuntime() + config.EnableORTOptimization() + predictor := NewPredictor(config) + inNames := predictor.GetInputNames() + t.Logf("InputNames:%+v", inNames) + outNames := predictor.GetOutputNames() + t.Logf("OutputNames:%+v", outNames) + + inHandle := predictor.GetInputHandle(inNames[0]) + inHandle.Reshape([]int32{1, 3, 224, 224}) + t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape()) + + data := make([]float32, numElements([]int32{1, 3, 224, 224})) + for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ { + data[i] = float32(i%255) * 0.1 + } + inHandle.CopyFromCpu(data) + t.Logf("inHandle Type:%+v", inHandle.Type()) + + predictor.Run() + + outHandle := predictor.GetOutputHandle(outNames[0]) + t.Logf("outHandle name:%+v", outHandle.Name()) + + outShape := outHandle.Shape() + t.Logf("outHandle Shape:%+v", outShape) + outData := make([]float32, numElements(outShape)) + outHandle.CopyToCpu(outData) + t.Log(outData) +} + + func TestFromBuffer(t *testing.T) { modelFile, err := os.Open("./mobilenetv1/inference.pdmodel") if err != nil { diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index edccc2648c012fda9e22c2fc14ffe4f90dc26cfe..cff9fd4aa7ceada2a37d9650c9ce3653f0155447 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -22,6 +22,7 @@ fi # 2. set LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ # 3. 
go test go clean -testcache diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57cdf57b372c37c8f7cea40c4a8d4c..b69292827aa136fd1d8a1f66d80823e6344a6174 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a432ff62810aa30c01c1980c80bf3f344039f7dd..f19b21d3e632633d7066c3e9e14cadd2900eb339 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -335,15 +335,37 @@ class MultiheadMatMulOpConverter : public OpConverter { reshape_before_fc_dim.d[4] = 1; auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0), + in_scale); + } reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); reshape_before_fc_layer->setName( ("shuffle_before_multihead_mamul(Output: " + output_name + ")") .c_str()); // add layer fc - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, - weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *reshape_before_fc_layer->getOutput(0), n, + nv_ksize, weight.get(), bias.get()); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n, weight.get(), bias.get()); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } fc_layer->setName( ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); @@ -359,6 +381,10 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + if (enable_int8) { + with_fp16 = 1; + } plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 71c4348685e1b01e158aa298c48953fc3a354cec..753cd70727643d660bb1ffd3607706613f595c78 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -105,7 +105,7 @@ class SkipLayerNormOpConverter : public OpConverter { "in CustomSkipLayerNormPluginDynamic hidden " "dimension should > 0")); if (enable_int8) { - type = static_cast(nvinfer1::DataType::kINT8); + type = static_cast(nvinfer1::DataType::kHALF); } const std::vector fields{ diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc 
b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c2c832e0239f6a30760c354aaf4699..7f7313fbcb5969aafea47ad23248acd5a6ca3644 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index 95916746d6fcb528d26a8f8bb39980b55c4f3704..b96992ef8514abe0f71dbf23d38abb626f6c4a5b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP(conv2d_transpose); namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb0795b868f0cd86591061cf8b6581..cf377396087637f115523ddc60a468e2a23d57d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b2764ca61c11219e5546867813157b7f05ee3ce8..d53a8923af6120adb460d95fc81820b6dfa03a60 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -54,6 +54,8 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kFLOAT; case FluidDT::VarType_Type_INT32: return TRT_DT::kINT32; + case FluidDT::VarType_Type_FP16: + return TRT_DT::kHALF; default: return TRT_DT::kINT32; } diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 57177cfa8b421e1d79004bb1a7f738d98dc00f97..336005d883b0f523213060645e540c35a14e4e9c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -16,7 +16,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 37214534f3c937bcf62bb34b51da2c934c566ced..8c96499a022f7e9f0d1fd8c512070592cf7428ff 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -453,6 +453,23 @@ if(WITH_MKLDNN) download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) + # mobilenetv3_large_x1_0 int8 + set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large") + set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar") + if (NOT EXISTS 
${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME}) + inference_download_and_uncompress_without_verify(${INT8_MOBILENETV3_LARGE_MODEL_DIR} "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" ${INT8_MOBILENETV3_FILE_NAME}) + endif() + inference_analysis_test_run(test_analyzer_int8_mobilenetv3_large + COMMAND ${INT8_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --warmup_batch_size=50 + --batch_size=1 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### BFLOAT16 tests # build test binary to be used in subsequent tests @@ -472,6 +489,17 @@ if(WITH_MKLDNN) # mobilenetv2 bfloat16 inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # mobilenetv3_large + inference_analysis_test_run(test_analyzer_bfloat16_mobilenetv3_large + COMMAND ${BF16_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --batch_size=1 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### Object detection models set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") @@ -692,6 +720,12 @@ inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zeroco EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) +if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model) +endif() + inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) @@ -739,6 +773,7 @@ if(WITH_MKLDNN) set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc index 3b16b0d34fd4cb87879bb6ed585e72b48167ac2c..f267f0f28d685e51f0359a345c52fbbe4a49fa16 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc @@ -14,13 +14,19 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); @@ -38,7 +44,12 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) { // read data from file and prepare batches with test data std::vector> input_slots_all; SetInputs(&input_slots_all); - b_cfg.EnableMkldnnBfloat16(); + if (FLAGS_enable_bf16 && + platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) { + b_cfg.EnableMkldnnBfloat16(); + } else { + FLAGS_enable_bf16 = false; + } CompareBFloat16AndAnalysis(&cfg, &b_cfg, input_slots_all); } diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index df0eb58c2bd587e69215602512cc51f19c97a978..a341ffd7a081c24500e3b061b0ce3510a2aaacbc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -81,6 +81,18 @@ TEST(PD_Config, interface) { PD_ConfigSetBfloat16Op(config, 1, &ops_name); #endif + PD_ConfigEnableONNXRuntime(config); + bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config); +#ifdef PADDLE_WITH_ONNXRUNTIME + EXPECT_TRUE(onnxruntime_enabled); +#else + EXPECT_FALSE(onnxruntime_enabled); +#endif + PD_ConfigDisableONNXRuntime(config); + bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config); + EXPECT_FALSE(onnxruntime_disabled); + PD_ConfigEnableORTOptimization(config); + PD_ConfigEnableMemoryOptim(config, true); bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); EXPECT_TRUE(memory_enabled); diff --git a/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..7cf6e2adfc688f70e0ed31f7c1f5305206aa1702 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
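The DistConfig block added to AnalysisConfig earlier in this patch is exercised single-rank by the dist-model tester below. As an illustrative sketch only (not part of the patch), a hypothetical two-trainer setup would be wired up as follows; the endpoint addresses and file paths are placeholders.

// Sketch only: configure rank 0 of a hypothetical two-trainer DistModel run.
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::AnalysisConfig MakeRank0Config() {
  paddle::AnalysisConfig config;
  config.SetModel("./model/__model__", "./model/__params__");
  config.EnableUseGpu(100, 0);
  config.SwitchUseFeedFetchOps(false);

  paddle::DistConfig dist;
  dist.EnableDistModel(true);
  dist.SetRanks(/*nranks=*/2, /*rank=*/0);  // this process is trainer 0 of 2
  dist.SetEndpoints({"127.0.0.1:6170", "127.0.0.1:6171"}, "127.0.0.1:6170");
  dist.SetCommInitConfig("./comm_init_cfg.csv");  // converter config path
  config.SetDistConfig(dist);
  return config;
}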
+ +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { + +TEST(test_dist_model, dist_model) { + std::cout << "Analysis Predictor DistModel test." << std::endl; + AnalysisConfig config; + config.SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + config.SwitchUseFeedFetchOps(false); + config.EnableUseGpu(100, 0); + DistConfig dist_config; + dist_config.SetRanks(1, 0); + dist_config.EnableDistModel(true); + dist_config.SetEndpoints({""}, ""); + config.SetDistConfig(dist_config); + + auto predictor = paddle_infer::CreatePredictor(config); + int batch_size = 1; + int channels = 1; + int height = 48; + int width = 512; + int nums = batch_size * channels * height * width; + std::cout << "Created predictor." << std::endl; + + float* input = new float[nums]; + for (int i = 0; i < nums; ++i) input[i] = 0; + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->CopyFromCpu(input); + std::cout << "Input data." << std::endl; + + predictor->Run(); + std::cout << "Zero Copy Run." << std::endl; + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + std::cout << "Output data." 
<< std::endl; + delete[] input; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 8f8b73044232a5cacfa3609e5f8e32ccf375d418..b07163b518b529e7ab01107e1f0d217443f574bd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -22,7 +22,12 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 637fa16e31ba7996713a6971c3a1802627811e7f..e63dfd14175b9955fbf5b6fdb0fb7904a330f264 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -213,15 +213,15 @@ std::shared_ptr> GetWarmupData( element_in_batch * 3 * 224 * 224, 3 * 224 * 224, static_cast(images.data.data()) + i * 3 * 224 * 224); - - std::copy_n(static_cast(test_data[batch][1].data.data()) + - element_in_batch, - 1, static_cast(labels.data.data()) + i); + if (FLAGS_with_accuracy_layer) + std::copy_n(static_cast(test_data[batch][1].data.data()) + + element_in_batch, + 1, static_cast(labels.data.data()) + i); } - - auto warmup_data = std::make_shared>(2); + auto warmup_data = std::make_shared>( + FLAGS_with_accuracy_layer ? 
2 : 1); (*warmup_data)[0] = std::move(images); - (*warmup_data)[1] = std::move(labels); + if (FLAGS_with_accuracy_layer) (*warmup_data)[1] = std::move(labels); return warmup_data; } @@ -254,9 +254,13 @@ void SetInputs(std::vector> *inputs, } for (auto i = 0; i < iterations; i++) { auto images = image_reader.NextBatch(); - auto labels = label_reader.NextBatch(); - inputs->emplace_back( - std::vector{std::move(images), std::move(labels)}); + std::vector tmp_vec; + tmp_vec.push_back(std::move(images)); + if (FLAGS_with_accuracy_layer) { + auto labels = label_reader.NextBatch(); + tmp_vec.push_back(std::move(labels)); + } + inputs->push_back(std::move(tmp_vec)); } } @@ -825,7 +829,8 @@ void CompareQuantizedAndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "INT8", sample_latency_int8); - CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); } void CompareBFloat16AndAnalysis( @@ -864,7 +869,8 @@ void CompareBFloat16AndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "BF16", sample_latency_bf16); - CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); } void CompareAnalysisAndAnalysis( diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 9d83f8ff8fdc4756450c0fe9ae4d7096d9afa76f..f376cbd4fb302b1d7a038d958465f24db653e220 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -172,6 +180,16 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -248,6 +266,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index dd4b64f28d739776ee750205d41b4dce35a97640..8123d3785003471fd5f63f24fbb1166913d7e571 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT -MSVC_STATIC_CRT=$6 +WITH_ONNXRUNTIME=$6 +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -144,7 +145,8 @@ function compile_test() { -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DWITH_GTEST=ON \ -DCMAKE_CXX_FLAGS='/std:c++17' \ - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj else cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -154,7 +156,8 @@ function compile_test() { -DWITH_STATIC_LIB=OFF \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + -DWITH_GTEST=ON \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) fi; cd - diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 05c468b798886ac135ed30bff75ce9400f1ca3a1..6b6c0cd22f03b902f08d7a79236b1091b9fe6677 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +if(WITH_ONNXRUNTIME) + set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") + if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) + inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + endif() + set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") +endif() + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6cd7d87332323f4bafd49b8b16254f9610405658..a7a417c29a7bdb7a47d4798246de55c0bd3536f9 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -17,7 +17,7 @@ if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4d0e485285146e5668793d29fd8effc789fcc339..61e292a922f0e98a958d4fe2f8fc7850bdf47e18 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -193,10 +193,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitNaiveBestFitCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id)); @@ -210,12 +210,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitCPUAllocator(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - if (FLAGS_use_stream_safe_cuda_allocator) { - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); - ++dev_id) { - InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); - } - } else { + if (!FLAGS_use_stream_safe_cuda_allocator) { for (int dev_id = 0; dev_id < 
platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -240,10 +235,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitAutoGrowthCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); @@ -298,6 +293,12 @@ class AllocatorFacadePrivate { } CheckAllocThreadSafe(); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + WrapCUDAGraphAllocator(); + } +#endif } inline const std::shared_ptr& GetAllocator( @@ -388,39 +389,6 @@ class AllocatorFacadePrivate { allocation.get())); return stream_safe_cuda_allocation->GetOwningStream(); } - -#ifdef PADDLE_WITH_CUDA - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif #endif private: @@ -439,24 +407,7 @@ class AllocatorFacadePrivate { platform::Place place_; }; - const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } + const AllocatorMap& GetAllocatorMap() { return allocators_; } void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = @@ -672,10 +623,10 @@ class AllocatorFacadePrivate { } void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - cuda_allocators_[p][stream]; - cuda_allocators_[p][stream] = std::make_shared( - underlying_allocator, p, stream); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared( + allocator, p, stream, + /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); } void 
WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -684,10 +635,19 @@ class AllocatorFacadePrivate { retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr allocator = cuda_allocators_[p][stream]; + std::shared_ptr& allocator = cuda_allocators_[p][stream]; allocator = std::make_shared(allocator, retry_time); } +#ifdef PADDLE_WITH_CUDA + void WrapCUDAGraphAllocator() { + for (auto& item : allocators_) { + auto& allocator = item.second; + allocator = CUDAGraphAllocator::Create(allocator); + } + } +#endif + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { @@ -738,7 +698,7 @@ class AllocatorFacadePrivate { auto custom_allocator = std::make_shared(p); allocators_[p] = std::make_shared( - custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + custom_allocator, phi::DeviceManager::GetMinChunkSize(p), allow_free_idle_chunk); } #endif @@ -814,11 +774,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) { places.emplace_back(platform::CustomPlace(dev_type, dev_id)); } } @@ -865,10 +824,6 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; -#ifdef PADDLE_WITH_CUDA - std::unordered_map> - cuda_graph_allocator_map_; -#endif #endif AllocatorStrategy strategy_; AllocatorMap allocators_; @@ -887,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} AllocatorFacade::~AllocatorFacade() {} AllocatorFacade& AllocatorFacade::Instance() { - static AllocatorFacade instance; - return instance; + static AllocatorFacade* instance = new AllocatorFacade; + return *instance; +} + +AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool"; + return iter->second.get(); + } +#endif + return m_; } const std::shared_ptr& AllocatorFacade::GetAllocator( @@ -896,19 +867,14 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - + AllocatorFacadePrivate* m = GetPrivate(); platform::CUDAPlace cuda_place(place.GetDeviceId()); - return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return 
GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } void* AllocatorFacade::GetBasePtr( @@ -923,7 +889,7 @@ void* AllocatorFacade::GetBasePtr( "GetBasePtr() is only implemented for CUDAPlace(), not " "suppot place: %s", allocation->place())); - return m_->GetBasePtr(allocation); + return GetPrivate()->GetBasePtr(allocation); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -931,21 +897,17 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, const gpuStream_t& stream) { if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + return GetPrivate()->GetAllocator(place, stream, + /*create_if_not_found=*/true); } - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } #endif const std::shared_ptr& AllocatorFacade::GetZeroAllocator( const platform::Place& place) { - return m_->GetAllocator(place, /* zero size */ 0); + return GetPrivate()->GetAllocator(place, /* zero size */ 0); } std::shared_ptr AllocatorFacade::AllocShared( @@ -958,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, size)->Allocate(size); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + phi::Stream default_stream = phi::Stream(reinterpret_cast( + GetPrivate()->GetDefaultStream(cuda_place))); + return Alloc(cuda_place, size, default_stream); } #endif - - return m_->GetAllocator(place, size)->Allocate(size); + return GetPrivate()->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_ - ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) - ->Release(place); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + return GetPrivate() + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -1002,71 +951,53 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); -#endif + return std::shared_ptr(Alloc(place, size, stream)); } -bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, - const phi::Stream& stream) { +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'InSameStream' function. To enable it, you can enter" + "multi-stream 'Alloc' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + platform::CUDAPlace p(place.GetDeviceId()); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + gpuStream_t s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return s == GetStream(allocation); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Alloc' function. To enable it, you can enter" + "multi-stream 'InSameStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - platform::CUDAPlace p(place.GetDeviceId()); - if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { - return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) - ->Allocate(size); - } else { - return m_->GetAllocator(p, size)->Allocate(size); - } } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( @@ -1076,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetAllocator(place, stream)->Release(place); + return GetPrivate()->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(std::shared_ptr allocation, @@ -1096,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, "'RecordStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - m_->RecordStream(allocation, stream); + GetPrivate()->RecordStream(allocation, stream); } const gpuStream_t& AllocatorFacade::GetStream( @@ -1116,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream( "'GetStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetStream(allocation); + return GetPrivate()->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - return m_->PrepareMemoryPoolForCUDAGraph(id); + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - return m_->RemoveMemoryPoolOfCUDAGraph(id); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } #endif #endif diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1722a06b01f1302c3bb1f98c99af0431ab62f955..9066bb284e28af197111b5d3ea129cc65b5fe914 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -49,6 +49,8 @@ class AllocatorFacade { static AllocatorFacade& Instance(); + AllocatorFacadePrivate* GetPrivate() const; + const std::shared_ptr& GetAllocator(const platform::Place& place); void* GetBasePtr(const std::shared_ptr& allocation); @@ -73,13 +75,14 @@ class AllocatorFacade { size_t size, const phi::Stream& stream); + AllocationPtr Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream); + bool InSameStream(const std::shared_ptr& allocation, 
const phi::Stream& stream); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. - AllocationPtr Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); @@ -96,6 +99,10 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_map_; +#endif }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index bd52c8f4ad270f0f70a23ab39b78bd9363ede769..e53d7b1cc766a3e277ef0a773671ef678bcb3ac7 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -32,17 +32,16 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) { } phi::Allocation* CustomAllocator::AllocateImpl(size_t size) { - std::call_once(once_flag_, - [this] { platform::DeviceManager::SetDevice(place_); }); + std::call_once(once_flag_, [this] { phi::DeviceManager::SetDevice(place_); }); void* ptr = - platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); if (LIKELY(ptr)) { return new Allocation(ptr, size, place_); } size_t avail, total; - platform::DeviceManager::MemoryStats(place_, &total, &avail); + phi::DeviceManager::MemoryStats(place_, &total, &avail); auto dev_type = platform::PlaceHelper::GetDeviceType(place_); auto dev_id = platform::PlaceHelper::GetDeviceId(place_); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index ea6d7019be6c1caf4844469276f3113525b33dfc..0bfbe2c6962294fc7e4aa2fff079e9cf411f26f8 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -739,7 +739,7 @@ class BuddyAllocatorList { private: explicit BuddyAllocatorList(const std::string &device_type) : device_type_(device_type) { - auto devices = platform::DeviceManager::GetDeviceList(device_type); + auto devices = phi::DeviceManager::GetDeviceList(device_type); for (auto dev_id : devices) { init_flags_[dev_id].reset(new std::once_flag()); } @@ -766,15 +766,15 @@ class BuddyAllocatorList { device_type_, dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { - platform::DeviceManager::SetDevice(device_type_, dev_id); + phi::DeviceManager::SetDevice(device_type_, dev_id); platform::CustomPlace place(device_type_, dev_id); allocators_[dev_id].reset(new BuddyAllocator( std::unique_ptr( new detail::CustomAllocator(device_type_, dev_id)), - platform::DeviceManager::GetMinChunkSize(place), - platform::DeviceManager::GetMaxChunkSize(place), - platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetMinChunkSize(place), + phi::DeviceManager::GetMaxChunkSize(place), + phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); }); return allocators_[dev_id].get(); @@ -808,9 +808,9 @@ void *Alloc(const platform::CustomPlace &place, auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - platform::DeviceGuard guard(place); + phi::DeviceGuard guard(place); size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, 
&total, &avail); PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. ", @@ -819,8 +819,7 @@ void *Alloc(const platform::CustomPlace &place, string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { - platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, - size); + phi::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, size); } } VLOG(10) << " pointer=" << ptr; diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 8627e3e6f8811e162ce3014c01145f331a03ee4b..072c4dee3bc45b4ff5f23f5288d3412a14f63b0f 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -15,56 +15,52 @@ #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + namespace paddle { namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream, + StreamSafeCUDAAllocator* allocator) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), - owning_stream_(std::move(owning_stream)) {} + owning_stream_(std::move(owning_stream)), + allocator_(allocator->shared_from_this()) {} void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { - VLOG(9) << "Record the same stream of " << stream; return; } std::lock_guard lock_guard(outstanding_event_map_lock_); - gpuEvent_t record_event; - auto it = outstanding_event_map_.find(stream); - if (it == outstanding_event_map_.end()) { - gpuEvent_t new_event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); -#endif - outstanding_event_map_[stream] = new_event; - record_event = new_event; - VLOG(9) << "Create a new event " << new_event; - } else { - record_event = it->second; - VLOG(9) << "Reuse event " << record_event; + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + graph_capturing_stream_set_.insert(stream); + return; } - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif - VLOG(8) << "Record event " << record_event << " to stream " << stream; + + RecordStreamWithNoGraphCapturing(stream); + RecordGraphCapturingStreams(); } bool StreamSafeCUDAAllocation::CanBeFreed() { - // NOTE(Ruibiao): This function will not execute concurrently, - // so outstanding_event_lock_ is not required here +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + return graph_capturing_stream_set_.empty() && + outstanding_event_map_.empty(); + } +#endif + + RecordGraphCapturingStreams(); + for (auto it = outstanding_event_map_.begin(); it != outstanding_event_map_.end(); ++it) { gpuEvent_t& 
event = it->second; @@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { return owning_stream_; } +void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() { + for (gpuStream_t stream : graph_capturing_stream_set_) { + RecordStreamWithNoGraphCapturing(stream); + } + graph_capturing_stream_set_.clear(); +} + +void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( + const gpuStream_t& stream) { + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( std::shared_ptr underlying_allocator, platform::CUDAPlace place, - gpuStream_t default_stream) + gpuStream_t default_stream, bool in_cuda_graph_capturing) : underlying_allocator_(std::move(underlying_allocator)), place_(std::move(place)), - default_stream_(std::move(default_stream)) { - std::lock_guard lock_guard(allocator_map_lock_); - allocator_map_[place].emplace_back(this); + default_stream_(std::move(default_stream)), + in_cuda_graph_capturing_(in_cuda_graph_capturing) { + if (LIKELY(!in_cuda_graph_capturing)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); + } } StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { - std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = allocator_map_[place_]; - allocators.erase(std::remove(allocators.begin(), allocators.end(), this), - allocators.end()); + if (LIKELY(!in_cuda_graph_capturing_)) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); + } } bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } @@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( static_unique_ptr_cast(std::move(underlying_allocation)), - default_stream_); + default_stream_, this); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; @@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { "StreamSafeCUDAAllocation*", allocation)); VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); - std::lock_guard lock_guard(unfreed_allocation_lock_); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; } else { VLOG(9) << "Put into unfreed_allocation list"; + std::lock_guard lock_guard(unfreed_allocation_lock_); unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& 
place) { + if (UNLIKELY(in_cuda_graph_capturing_)) { + VLOG(7) << "Memory release forbidden in CUDA Graph Captruing"; + return 0; + } + std::lock_guard lock_guard(allocator_map_lock_); std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { - released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + released_size += allocator->ProcessUnfreedAllocationsAndRelease(); } VLOG(8) << "Release " << released_size << " bytes memory from all streams"; return released_size; @@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { } } -uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() { ProcessUnfreedAllocations(); return underlying_allocator_->Release(place_); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 7354836308cfba0338fb2e146cc14182006876ee..ecddff97c206be968148e32ddf3f9c6623bf8bde 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,10 +14,9 @@ #pragma once -#include #include #include -#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" @@ -32,27 +31,38 @@ namespace paddle { namespace memory { namespace allocation { +class StreamSafeCUDAAllocator; + class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, - gpuStream_t owning_stream); + gpuStream_t owning_stream, + StreamSafeCUDAAllocator *allocator); + void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); - const gpuStream_t &GetOwningStream() const; private: + void RecordGraphCapturingStreams(); + void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream); DecoratedAllocationPtr underlying_allocation_; + std::set graph_capturing_stream_set_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; + // To compatiable with CUDA Graph, hold the allocator shared_ptr so that + // Allocator will not deconstruct before Allocation + std::shared_ptr allocator_; }; -class StreamSafeCUDAAllocator : public Allocator { +class StreamSafeCUDAAllocator + : public Allocator, + public std::enable_shared_from_this { public: StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, - platform::CUDAPlace place, - gpuStream_t default_stream); + platform::CUDAPlace place, gpuStream_t default_stream, + bool in_cuda_graph_capturing = false); ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; @@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator { private: void ProcessUnfreedAllocations(); - uint64_t ProcessUnfreedAllocationsWithRelease(); + uint64_t ProcessUnfreedAllocationsAndRelease(); static std::map> allocator_map_; @@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator { gpuStream_t default_stream_; std::list unfreed_allocations_; SpinLock unfreed_allocation_lock_; + + bool in_cuda_graph_capturing_; }; } // namespace allocation diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index d7bbfba932cb4a5aab01bc3e2d1276dbe6450b29..076a96139612168f6c3d5d039184ccdb7a536f2e 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc 
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -26,6 +26,7 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { @@ -43,11 +44,11 @@ BuddyAllocator::BuddyAllocator( #ifdef PADDLE_WITH_CUSTOM_DEVICE if (!dev_type.empty()) { init_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetInitAllocSize( + return phi::DeviceManager::GetInitAllocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; re_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetReallocSize( + return phi::DeviceManager::GetReallocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index a61f98c4e1a22adcc3684a9e5af190a82e3b5110..37ac0b4483291c8c3a3eeb31883c55c7eda24dc8 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -438,7 +438,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void* p; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); p = device->MemoryAllocate(size); if (LIKELY(p)) { VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; @@ -447,7 +447,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { } else { size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, &total, &avail); PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on %s %d. 
" "total memory is %s, used memory is %s, " @@ -470,7 +470,7 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) { size, plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); device->MemoryDeallocate(p, size); } diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index b60bb4fc1d1bb5e4366625277db8fdb968474891..2bca2c388a05958fda0e891190dcf7e7ddc53b0c 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,6 +41,11 @@ std::shared_ptr AllocShared(const platform::Place& place, stream); } +AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); +} + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().InSameStream(allocation, @@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr& allocation) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream) { - return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); -} - uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 89b4caa5bed26fa9b8d0bf09df702f17a310dff6..601fe3f2a42c391c602887bacccae97125b951e1 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -41,15 +41,15 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream); + extern bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); extern void* GetBasePtr(const std::shared_ptr& allocation); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream); - extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 166cdd0b5d6b6a523cfe470662951184ebbfabc5..3198b4f8d935e3815ba94db945a24ab4df4ca97b 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -44,9 +44,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( dst, src, num, &stream_wrapper); } @@ -62,9 +62,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(dst_place); - platform::stream::Stream stream_wrapper(dst_place, stream); - platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + phi::DeviceManager::SetDevice(dst_place); + 
phi::stream::Stream stream_wrapper(dst_place, stream); + phi::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( dst, src, num, &stream_wrapper); } @@ -82,16 +82,16 @@ void Copy( << dst_place << ", stream=" << stream; if (src_type == dst_type) { - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); auto src_id = platform::PlaceHelper::GetDeviceId(src_place); auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); if (src_id == dst_id) { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( dst, src, num, &stream_wrapper); } else { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( dst_place, dst, src, num, &stream_wrapper); } } else { diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 933717f3090c4b25f912e0bbe87922a1494c128a..5e4a4234bb41663f2287203fa9123029e6894036 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -12,34 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #include // NOLINT #include #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/stream.h" +#ifdef PADDLE_WITH_CUDA +#include +#include +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + namespace paddle { namespace memory { -__global__ void add_kernel(int *x, int n) { +// y += (x + 1) +__global__ void add_kernel(int *x, int *y, int n) { int thread_num = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (int i = thread_id; i < n; i += thread_num) { - atomicAdd(x + i, thread_id); + y[i] += x[i] + 1; } } @@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) { << " there may be a memory leak problem"; } -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 32; - data_num_ = 131072; - workspace_size_ = data_num_ * sizeof(int); - - // alloc workspace for each stream - for (size_t i = 0; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); -#endif - - std::shared_ptr allocation = - AllocShared(place_, workspace_size_, - phi::Stream(reinterpret_cast(stream))); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - - streams_.emplace_back(stream); - workspaces_.emplace_back(allocation); - } - - result_ = Alloc(place_, stream_num_ * 
workspace_size_); - } - - void SingleStreamRun(size_t idx) { - // for all stream i, - // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j] - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(workspaces_[i]->ptr()); - add_kernel<<>>(x, data_num_); - RecordStream(workspaces_[i], streams_[idx]); - } - } - - void CopyResultAsync() { - for (size_t i = 0; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice)); -#endif - } - } - - void MultiStreamRun() { - for (size_t i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void CheckResult() { - auto result_host = std::unique_ptr(new int[result_->size()]); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(), - result_->size(), - cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(), - result_->size(), - hipMemcpyDeviceToHost)); -#endif - size_t thread_num = grid_num_ * block_num_; - for (size_t i = 0; i < stream_num_; ++i) { - for (size_t j = 0; j < data_num_; ++j) { - EXPECT_TRUE(result_host[i * stream_num_ + j] == - (j % thread_num) * stream_num_); - } - } - result_.reset(); - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - CheckMemLeak(place_); - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - size_t workspace_size_; - platform::CUDAPlace place_; - std::vector streams_; - std::vector> workspaces_; - allocation::AllocationPtr result_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = - Alloc(place, alloc_size, default_stream); + Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(default_stream))); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); allocation_unique.reset(); @@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, 
GetStreamInterfaceTest) { CheckMemLeak(place); } -#ifdef PADDLE_WITH_CUDA -TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { - platform::CUDAPlace place = platform::CUDAPlace(); - size_t alloc_size = 1; - std::shared_ptr allocation = AllocShared(place, alloc_size); - - platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); - EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); - EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), - paddle::platform::EnforceNotMet); - EXPECT_THROW( - AllocShared(place, alloc_size, - phi::Stream(reinterpret_cast(nullptr))), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); - EXPECT_THROW(RecordStream(allocation, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet); - platform::EndCUDAGraphCapture(); - - allocation.reset(); - Release(place); - CheckMemLeak(place); -} -#endif - TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second alloc will fail and retry size_t alloc_size = available_size / 4 * 3; - allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation1 = Alloc( + place, alloc_size, phi::Stream(reinterpret_cast(stream1))); allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = Alloc(place, alloc_size, stream2); + allocation2 = Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(stream2))); }); allocation1.reset(); // free but not release th.join(); @@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { CheckMemLeak(place); } +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + grid_num_ = 1; + block_num_ = 32; + data_num_ = 131072; + workspace_size_ = data_num_ * sizeof(int); + + for (size_t i = 0; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); +#endif + + std::shared_ptr workspace_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr result_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemset(result_allocation->ptr(), 0, result_allocation->size())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemset(result_allocation->ptr(), 0, result_allocation->size())); +#endif + + streams_.emplace_back(stream); + workspaces_.emplace_back(workspace_allocation); + 
results_.emplace_back(result_allocation); + host_results_.emplace_back(host_result_allocation); + } + } + + void SingleStreamRun(size_t idx) { + int *y = reinterpret_cast(results_[idx]->ptr()); + int neighbouring_idx = idx > 0 ? idx - 1 : idx; + + add_kernel<<>>( + reinterpret_cast(workspaces_[idx]->ptr()), y, data_num_); + add_kernel<<>>( + reinterpret_cast(workspaces_[neighbouring_idx]->ptr()), y, + data_num_); + RecordStream(workspaces_[neighbouring_idx], streams_[idx]); + } + + void MultiStreamRun() { + // Must run in reverse order, or the workspace_[i - 1] will be released + // before streams_[i]'s kernel launch + for (int i = stream_num_ - 1; i >= 0; --i) { + SingleStreamRun(i); + workspaces_[i].reset(); // fast GC + } + } + + void MultiThreadMultiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + workspaces_.clear(); + } + + void CUDAGraphRun() { + testing_cuda_graph_ = true; + platform::BeginCUDAGraphCapture(platform::CUDAPlace(), + cudaStreamCaptureModeGlobal); + + std::shared_ptr data_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + std::shared_ptr result_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + + int *data = static_cast(data_allocation->ptr()); + int *result = static_cast(result_allocation->ptr()); + + gpuStream_t main_stream = GetStream(data_allocation); + gpuStream_t other_stream; + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream)); + + add_kernel<<>>(data, result, + data_num_); + RecordStream(data_allocation, other_stream); + + std::unique_ptr cuda_graph = + platform::EndCUDAGraphCapture(); + + int replay_times = 10; + for (int i = 0; i < replay_times; ++i) { + cuda_graph->Replay(); + } + + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + Copy(host_result_allocation->place(), host_result_allocation->ptr(), + result_allocation->place(), result_allocation->ptr(), workspace_size_, + main_stream); + cudaStreamSynchronize(main_stream); + + int *host_result = static_cast(host_result_allocation->ptr()); + for (int i = 0; i < data_num_; ++i) { + EXPECT_EQ(host_result[i], replay_times); + } + + data_allocation.reset(); + result_allocation.reset(); + cuda_graph.release(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream)); + } + + void CheckResult() { + for (size_t i = 0; i < stream_num_; ++i) { + Copy(host_results_[i]->place(), host_results_[i]->ptr(), + results_[i]->place(), results_[i]->ptr(), workspace_size_, + streams_[i]); + } + cudaDeviceSynchronize(); + + size_t thread_num = grid_num_ * block_num_; + for (size_t i = 0; i < stream_num_; ++i) { + int *result = static_cast(host_results_[i]->ptr()); + for (size_t j = 0; j < data_num_; ++j) { + EXPECT_EQ(result[j], 2); + } + } + } + + void TearDown() override { + workspaces_.clear(); + results_.clear(); + host_results_.clear(); + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + // Memory release for CUDA Graph memory pool is forbidden + if (!testing_cuda_graph_) { + CheckMemLeak(place_); + } + } + + bool testing_cuda_graph_{0}; + size_t stream_num_; + size_t grid_num_; + size_t block_num_; 
+ size_t data_num_; + size_t workspace_size_; + platform::CUDAPlace place_; + std::vector streams_; + std::vector> workspaces_; + std::vector> results_; + std::vector> host_results_; +}; + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { + MultiThreadMultiStreamRun(); + CheckResult(); +} + +#ifdef PADDLE_WITH_CUDA +TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) { + MultiStreamRun(); + CUDAGraphRun(); + CheckResult(); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e95378012aa398ff996cbc10f216..e77be832c0cc8975c3fc2ebb7fad577cdfe919f5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index c28026a4bd43aac5b0c447e24a164e27233076e8..e1460629fb18a4259731c2c9de4ed8f623b5a1e4 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada7335cab510ef82c9f46d2da7eb05..b4a97e24cf29233776b19aa0ea7764a00435f6fc 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr 
ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7e0a5be2d680afba971d54b492c05d..66f1bcc8b68692abe588b6429b027462eaebde24 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if 
(static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 +1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6ba8406d58804888b5dcd8bc3be0..4b79397b6cdf2e5c2993f7a72f512cc924c208e7 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
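Editor's note on the pattern in the hunks above and below: the patch swaps the old unscoped kDepX/kDepOut/kNoDeps constants for the scoped phi::funcs::ActBwdOpFwdDeps enum, which is why every dependency check now casts both operands with static_cast<int>. The following is a standalone sketch of that pattern only, not part of the patch; the local ActBwdOpFwdDeps copy and HasDep helper are stand-ins assumed to mirror the real enum's values.

#include <iostream>

// Stand-in for phi::funcs::ActBwdOpFwdDeps (hypothetical local copy for this sketch).
enum class ActBwdOpFwdDeps : int {
  kNoDeps = 0x00,  // needs no forward tensor
  kDepX = 0x01,    // needs forward input X
  kDepOut = 0x02,  // needs forward output Out
};

// A scoped enum has no implicit conversion to int, so each bitmask test is
// written with explicit casts, matching the InferShape checks in this file.
constexpr bool HasDep(ActBwdOpFwdDeps value, ActBwdOpFwdDeps flag) {
  return (static_cast<int>(value) & static_cast<int>(flag)) != 0;
}

int main() {
  constexpr ActBwdOpFwdDeps deps = ActBwdOpFwdDeps::kDepOut;
  std::cout << std::boolalpha
            << HasDep(deps, ActBwdOpFwdDeps::kDepX) << " "    // false
            << HasDep(deps, ActBwdOpFwdDeps::kDepOut) << "\n";  // true
  return 0;
}

The scoped enum prevents the flags from silently converting to int, so the explicit casts make every dependency test visible at the call site; that is the trade-off the repeated static_cast edits in this file accept.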
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.kps similarity index 78% rename from paddle/fluid/operators/activation_op.cu rename to paddle/fluid/operators/activation_op.kps index e578ad899e74b7afb6b966d2afa5695be1e6c5c9..92a101451e211f912e5390171654affa3be4e973 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. 
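Across activation_op.h above, every bare `kDepX` / `kDepOut` / `kNoDeps` return is rewritten as `ActBwdOpFwdDeps::kDepX` and so on, consistent with `ActBwdOpFwdDeps` having become a scoped enum shared with phi. A minimal sketch of the resulting pattern, assuming an enum definition of roughly this shape (the enum body is an assumption for illustration; the real definition lives in phi):

// Assumed shape of the forward-dependency flags used by the grad functors.
enum class ActBwdOpFwdDeps : int {
  kNoDeps = 0x00,  // backward needs neither forward X nor forward Out
  kDepX = 0x01,    // backward needs the forward input X
  kDepOut = 0x02,  // backward needs the forward output Out
};

template <typename T>
struct ExampleGradFunctor {
  // With a scoped enum the enumerator must be qualified at every use site:
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};

Bitmask-style tests keep working through explicit casts, as in the updated `ExtractActivationDoubleGradTensor` above: `static_cast<int>(kDepValue) & static_cast<int>(ActBwdOpFwdDeps::kDepX)`.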
*/ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : 
public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,88 +244,11 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); - } -}; - -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } }; -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = 
static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return 
dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1509,7 +1226,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ grad_functor) \ @@ -1531,7 +1252,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, @@ -1594,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - 
ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1650,7 +1331,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidDoubleGradKernel>, ops::SigmoidDoubleGradKernel>); + ops::SigmoidGradGradFunctor>, + ops::SigmoidDoubleGradKernel>); REGISTER_OP_CUDA_KERNEL( sigmoid_triple_grad, @@ -1659,7 +1342,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidTripleGradKernel>, ops::SigmoidTripleGradKernel>); + ops::SigmoidTripleGradFunctor>, + ops::SigmoidTripleGradKernel< + plat::CUDADeviceContext, + ops::SigmoidTripleGradFunctor>); /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -1696,7 +1382,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SqrtDoubleGradKernel>, ops::SqrtDoubleGradKernel>); + ops::SqrtGradGradFunctor>, + ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1726,6 +1414,8 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareGradGradFunctor>, ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>); \ + REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ + ops::ActivationGradCudaKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, + CudaSoftplusGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, + CudaHardSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, + CudaCELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, + CudaLog1pGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, + 
CudaBReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, + CudaSoftReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, + CudaSoftsignGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, + CudaRelu6GradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, + CudaHardShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, + CudaHardSigmoidFunctor, + CudaHardSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, + CudaSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, + CudaThresholdedReluFunctor, + CudaThresholdedReluGradFunctor); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index de4d7818020dd586547ff9eedb53108285048c09..716a2e40179e404c2afcec31fb72cde7172f7e54 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -147,8 +147,8 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, - PT_INFER_META(phi::AddmmInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PD_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, ops::AddMMOpGradMaker, diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h index f7aa0de97598df67817d81c1d1c1a5e8356f42ea..56aebe90788fbaa6c300ee9ac620c3d7613ff141 100644 --- a/paddle/fluid/operators/amp/fp16_type_traits.h +++ b/paddle/fluid/operators/amp/fp16_type_traits.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -32,6 +33,12 @@ class MPTypeTrait { using Type = float; }; +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index db5a3ea2961948a241c0424c77eecb0d77183e48..116a8053db3edb724d2c68b93d92ce958fbe8e32 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -36,8 +36,8 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(phi::funcs::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(phi::dtype::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); @@ -57,7 +57,7 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c73ae98181685269592f409196cd..c5e4188ca2d6f749a06127c41da99490a7fb3ffc 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
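The operators below (arg_max here, then arg_min and argsort) follow the same migration already visible in addmm_op.cc above: the hand-written `InferShape` override and the per-type `REGISTER_OP_CPU_KERNEL` block are dropped, and shape/dtype inference is delegated to a phi `InferMeta` function through `DECLARE_INFER_SHAPE_FUNCTOR` + `PD_INFER_META` (the addmm hunk also fixes the earlier `DELCARE_INFER_SHAPE_FUNCTOR` / `PT_INFER_META` spellings). A hedged sketch of the wiring for a hypothetical operator — `my_op`, `MyOp`, and `phi::MyInferMeta` are placeholder names, not identifiers from this diff:

// Illustrative only: bind phi::MyInferMeta as my_op's shape inference.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyInferMeta));

REGISTER_OPERATOR(
    my_op, ops::MyOp, ops::MyOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    MyOpInferShapeFunctor);  // the functor is appended as the last argument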
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9d94f30ed06333b9c8766fd2310d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) 
\ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d322e40966ce59f9a10320ceab4f..585341beea12c14fbd01a3a47af34ce57def0db5 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdfccfc4eae99da730e96fe5f0a540..fb3abd01af8c396d764f9f1d247f24c41bd15959 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu deleted file mode 100644 index 23170bf0087906d752767051ce58874cb3584ee5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" -REGISTER_OP_CUDA_KERNEL( - arg_min, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d37242d0e239e81d2d2976b92a6b4..1a8aca777370bc140e39b7457702557042541744 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 8b7a0b3eadb16bbe0822809748e343dc0d793a0f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. -template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. 
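    // ("Full sort" means the sort axis is already the innermost one, so the
    //  tensor is viewed as a 2-D [input_height, input_width] matrix, where
    //  input_height is the product of all leading dims and input_width is the
    //  last dim, and every row is sorted as one independent cub segment. Any
    //  other axis is handled in the else-branch below by first swapping it
    //  with the last axis: for shape [2, 3, 4] and axis = 1 the permutation
    //  trans = {0, 2, 1} gives a [2, 4, 3] view whose last axis is the one to
    //  sort, and because such a swap is its own inverse the same trans is
    //  applied again afterwards to restore the original layout.)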
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ‘axis’ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
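    // (The backward pass is a scatter through the saved indices: since
    //  Out[j, i] = X[j, Indices[j, i]] along the sorted axis, the kernels
    //  above write dX[j, Indices[j, i]] = dOut[j, i] (FillGrad), or
    //  dX[Indices[i]] = dOut[i] in the fully flattened case (FillFlattenGrad).
    //  Each row of Indices is a permutation, so every element of dX is written
    //  exactly once and no atomics are required.)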
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf061d3e5fc46bd53a2ef56610d6de9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
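One detail of the deleted ArgFullSort above that is easy to misread: the two nearly identical cub::DeviceSegmentedRadixSort calls are cub's standard two-phase convention, not a duplicated sort. The first call, made with a null workspace pointer, only reports the required temporary-storage size; the sort itself happens in the second call. Condensed to the essentials (placeholder variable names, ascending case only):

// Phase 1: d_temp_storage == nullptr, so cub only fills in temp_bytes.
size_t temp_bytes = 0;
cub::DeviceSegmentedRadixSort::SortPairs(
    nullptr, temp_bytes, keys_in, keys_out, idx_in, idx_out,
    num_rows * num_cols, num_rows, seg_offsets, seg_offsets + 1,
    /*begin_bit=*/0, /*end_bit=*/sizeof(T) * 8, stream);
// ... allocate temp_bytes bytes of device memory as d_temp_storage ...
// Phase 2: same arguments plus the real workspace; this performs the sort.
cub::DeviceSegmentedRadixSort::SortPairs(
    d_temp_storage, temp_bytes, keys_in, keys_out, idx_in, idx_out,
    num_rows * num_cols, num_rows, seg_offsets, seg_offsets + 1,
    /*begin_bit=*/0, /*end_bit=*/sizeof(T) * 8, stream);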
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0b4119dc0a578a1cba4631eb45f2..c927eec00bc8bf9e84ad1fb53a907ff8ec71acbc 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c63a1d2693dfb47dc618c3e707ae0..359b00fcf87ee1bee27e668ae3973fa39be19d76 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c33cbfeddc9f35818e42ebe0137fa..b452dea8536dd98d6d4060d5224e39daf9137c50 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,7 +23,6 @@ limitations under the License. 
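For the CPU path removed with argsort_op.h above (which is also why argsort_op_npu.cc and argsort_op_xpu.cc now include op_registry.h directly), the heart of FullSort is a per-row sort of (value, index) pairs. Stripped of the Eigen plumbing, it amounts to the following standalone sketch, not the deleted code verbatim:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Distilled form of the deleted FullSort: sort one row of `width` values and
// record the original positions, honoring `descending`.
template <typename T>
void SortRowWithIndices(const T* row, int64_t width, bool descending,
                        T* sorted_vals, int64_t* sorted_idx) {
  std::vector<std::pair<T, int64_t>> col_vec;
  col_vec.reserve(width);
  for (int64_t j = 0; j < width; ++j) col_vec.emplace_back(row[j], j);
  std::sort(col_vec.begin(), col_vec.end(),
            [descending](const std::pair<T, int64_t>& l,
                         const std::pair<T, int64_t>& r) {
              return descending ? l.first > r.first : l.first < r.first;
            });
  for (int64_t j = 0; j < width; ++j) {
    sorted_vals[j] = col_vec[j].first;
    sorted_idx[j] = col_vec[j].second;
  }
}

The removed ArgsortKernel applied this row by row over the [input_height, input_width] view and, for a non-innermost axis, wrapped it in the same transpose-sort-transpose pattern as the CUDA kernel.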
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 71a895c244c54f62c0af1745635c08fea35436c4..0783b30a8580db403255211d879d9400a1e82ab7 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, - PT_INFER_META(phi::Atan2InferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PD_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index a23e484d0a88bb87febc6d320f9183ef50ea0ebc..78ea8b6b6fbebd7e0ca5ce14cc2cba6ff197177f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -269,10 +269,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + phi::funcs::vec_add_bias(n, *bias, x, y); + phi::funcs::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + phi::funcs::vec_relu(n, x, y); } } @@ -283,14 +283,14 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? 
x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + phi::funcs::vec_add_bias(n, -scalar, x, y); // sub + phi::funcs::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { scalar += y[i]; } - math::vec_scal(n, static_cast(1) / scalar, y); // scale + phi::funcs::vec_scal(n, static_cast(1) / scalar, y); // scale } template @@ -344,12 +344,12 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 949cf021cf0fa322970c210fa26f698fd2bc45b2..174207deb08b84194d6f20fe04e4c27245295caf 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, ops::BatchNormDoubleGradMaker); REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, ops::BatchNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index d59396db1517faadaa2dd9e9af770d2e8a23ec56..a19b087245a89a4a12f062b1ce27835b98ecfd66 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -template -static __global__ void BNForwardInference( - const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int num = N * C * HxW; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; - BatchNormParamType x_sub_mean = - static_cast>(x[i]) - mean[c]; - BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); - y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( - const T *x, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - bool test_mode = is_test && (!trainable_stats); - - // Get the size for each dimension. 
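    // (For reference: the deleted BNForwardInference/BNForwardTraining kernels
    //  above, like the cuDNN calls further down, compute
    //      y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c],
    //  where the channel c of element i is (i / (H*W*D)) % C for NCHW and
    //  i % C for NHWC. Inference uses the running mean/variance inputs, while
    //  training reduces mean and variance per channel over the N*H*W*D
    //  elements, stores 1/sqrt(var + epsilon) in SavedVariance, and updates
    //  the running statistics as an exponential moving average controlled by
    //  momentum.)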
- // NCHW [batch_size, in_channels, in_height, in_width] - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5" - "But received: the size of input's dimensions is [%d]", - x_dims.size())); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - test_mode || - (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); - - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_y(y->type()); - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, y, - &transformed_y); - } else { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * D * C, 1, W * D * C, D * C, C}; - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor( -// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto &dev_ctx = ctx.template device_context(); - - auto handle = dev_ctx.cudnn_handle(); - - // Now, depending on whether we are running test or not, we have two paths. - // It is training mode when it's not reference AND not using pre-trained - // model. - bool training = !test_mode && !use_global_stats; - if (!training) { - // only when test we use input to do computation. - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - // Run inference mode. - PADDLE_ENFORCE_EQ( - est_mean->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of mean's dimensions must equal to 1." - "But received: the size of mean's dimensions mean is [%d]," - "the dimensions of mean is [%s].", - est_mean->dims().size(), est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of variance's dimensions must equal to 1." - "But received: the size of variance's dimensions is [%d]," - "the dimensions of variance is [%s].", - est_var->dims().size(), est_var->dims())); - PADDLE_ENFORCE_EQ( - est_mean->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of mean must equal to the number of " - "Channels, which is [%d]. But received: the first dimension" - "of mean is [%d], the dimensions of mean is [%s].", - C, est_mean->dims()[0], est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of variance must equal to the number" - "of Channels, which is [%d]. 
But received: the first dimension of" - "variance is [%d], the dimensions of variance is [%s].", - C, est_var->dims()[0], est_var->dims())); - -#ifdef PADDLE_WITH_HIP - const int block_size = 256; - const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; - if (compute_format == DataLayout::kNCHW) { - BNForwardInference< - T, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } else { - BNForwardInference< - T, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardInference( -// handle, miopenBNSpatial, -// const_cast( -// static_cast(CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// const_cast(static_cast( -// est_mean->template data>())), -// const_cast(static_cast( -// est_var->template data>())), -// epsilon)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); -#endif - } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), - &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // Run training mode. - // obtain running mean and running inv var, and there is no need - // to initialize them. - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - - if ((N * H * W * D) == 1) { - // Only 1 element in normalization dimension, - // skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - } else { - double this_factor = 1. - momentum; - - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. 
Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), transformed_x.type(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), nullptr, nullptr, data_desc_, - transformed_y.template data(), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), - nullptr, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, block, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// 
const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); -#endif - } - } - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_y, y); - } -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( - const T *dy, const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const double epsilon, const int N, - const int C, const int HxW, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); - BatchNormParamType mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - ds_sum += static_cast>(dy[index]) * - (static_cast>(x[index]) - mean_i); - db_sum += static_cast>(dy[index]); - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale[i] = ds_sum * inv_var_i; - dbias[i] = db_sum; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - -template -static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - double epsilon, int C, int M, - const int num, const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -class InplaceHelper { - public: - void operator()(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, double epsilon, int C, - int M, const int num, const T *y, int grid2, const int block, - const gpuStream_t &stream) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); - KeBNRestoreData<<>>( - layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( - const T *dy, const T *x, const BatchNormParamType *scale, - const BatchNormParamType *saved_mean, - const BatchNormParamType *saved_inv_variance, const int C, const int N, - const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType inv_var_val; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType dscale_val; - __shared__ BatchNormParamType dbias_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - if (saved_mean && saved_inv_variance) { - if (threadIdx.x == 0) { - inv_var_val = saved_inv_variance[i]; - mean_val = saved_mean[i]; - } - } else { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = - static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - 
const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = - static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - inv_var_val = - 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); - } - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - ds_sum += - dy_i * (static_cast>(x[index]) - mean_val); - db_sum += dy_i; - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale_val = ds_sum * inv_var_val; - dbias_val = db_sum; - dscale[i] = dscale_val; - dbias[i] = dbias_val; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = scale[i] * inv_var_val * - (static_cast>(dy[index]) - - dbias_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_val) * - inv_var_val * dscale_val / inner_size); - } - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( - const T *dy, const BatchNormParamType *scale, - const BatchNormParamType *mean, const T *x, - const BatchNormParamType *variance, const int C, const int N, - const int HxW, T *dx) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType inv_var_i = variance[i]; - BatchNormParamType mean_i = mean[i]; - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[index]) - mean_i); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) - .Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = - (static_cast>(dy[index]) - - dy_sum_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_i) * - dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * - scale[i] * inv_var_i; - } - } -} - -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - const bool is_test = ctx.Attr("is_test"); - use_global_stats = is_test || use_global_stats; - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5." - "But received: the size of input's dimensions is [%d]," - "the dimensions of input is [%s]", - x_dims.size(), x_dims)); - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of scale's dimensions must equal to 1. But received: " - "the size of scale's dimensions is [%d], the dimensions of scale " - "is [%s].", - scale->dims().size(), scale->dims())); - PADDLE_ENFORCE_EQ( - scale->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of scale must equal to Channels[%d]. But " - "received: the first dimension of scale is [%d]", - C, scale->dims()[0])); - - auto dtype = platform::CudnnDataType::type; - const auto *reserve_space = ctx.Input("ReserveSpace"); -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? 
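As the comment above notes, in in-place mode only Y is available, so KeBNRestoreData/InplaceHelper invert the forward transform y = scale*(x - mean)*inv_std + bias before the gradient math runs. A scalar sketch of that inversion (illustrative only; inv_std stands in for the saved inverse variance the helper receives):

```cpp
#include <cassert>
#include <cmath>

// Forward: y = scale * (x - mean) * inv_std + bias
// Inverse used by the in-place path: x = (y - bias) / scale / inv_std + mean
float bn_forward(float x, float scale, float bias, float mean, float inv_std) {
  return scale * (x - mean) * inv_std + bias;
}
float bn_restore(float y, float scale, float bias, float mean, float inv_std) {
  return (y - bias) / scale / inv_std + mean;
}

int main() {
  const float x = 3.7f, mean = 2.0f, var = 4.0f, eps = 1e-5f;
  const float inv_std = 1.0f / std::sqrt(var + eps);
  const float y = bn_forward(x, /*scale=*/1.5f, /*bias=*/0.25f, mean, inv_std);
  assert(std::fabs(bn_restore(y, 1.5f, 0.25f, mean, inv_std) - x) < 1e-4f);
  return 0;
}
```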
DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && - reserve_space != nullptr; - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_d_y(d_y->type()); - Tensor transformed_d_x; - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, d_y, - &transformed_d_y); - TransToChannelFirst(ctx, d_y, - &transformed_d_y); - if (d_x) { - ResizeToChannelFirst(ctx, d_x, - &transformed_d_x); - } - } else { - transformed_x.ShareDataWith(*x); - transformed_d_y.ShareDataWith(*d_y); - if (d_x) { - transformed_d_x.ShareDataWith(*d_x); - } - } - - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - - auto &dev_ctx = ctx.template device_context(); - const int num = transformed_x.numel(); -#ifdef HIPCC - const int block = 256; -#else - const int block = 512; -#endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - auto stream = dev_ctx.stream(); - InplaceHelper inplace_functor; - - if (!use_global_stats) { - if ((N * H * W * D) == 1) { - if (d_x) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - } - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
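The dims/strides pair above describes both layouts to cuDNN with the same logical {N, C, H, W, D} dims; only the strides change. A small sketch of how those two stride vectors follow from the memory layout (hypothetical helpers, same ordering as above):

```cpp
#include <cstdio>
#include <vector>

// Strides for a logical {N, C, H, W, D} descriptor. Sketch only; the real
// code builds these vectors inline before calling the cuDNN descriptor API.
std::vector<int> nchw_strides(int C, int H, int W, int D) {
  return {C * H * W * D, H * W * D, W * D, D, 1};
}
std::vector<int> nhwc_strides(int C, int H, int W, int D) {
  // Channels are innermost in memory, so the C stride is 1.
  return {H * W * C * D, 1, W * D * C, D * C, C};
}

int main() {
  for (int s : nchw_strides(3, 4, 5, 1)) std::printf("%d ", s);
  std::printf("\n");
  for (int s : nhwc_strides(3, 4, 5, 1)) std::printf("%d ", s);
  std::printf("\n");
  return 0;
}
```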
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, -// data_desc_, mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - if (is_inplace) { - inplace_functor(compute_format, transformed_x.data(), - scale->template data>(), - bias->template data>(), - saved_mean_data, saved_var_data, epsilon, C, H * W * D, - num, transformed_x.data(), grid2, block, stream); - } - - // This branch calls CUDNN APIs - if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/transformed_d_x.template mutable_data( - ctx.GetPlace()), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - 
/*bnBiasData=*/nullptr, - /*dBnScaleData=*/d_scale - ->template mutable_data>( - ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>( - ctx.GetPlace()), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - if (compute_format == DataLayout::kNCHW) { - BNBackward< - T, block, - DataLayout::kNCHW><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } else { - BNBackward< - T, block, - DataLayout::kNHWC><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_d_x, d_x); - } - } else { - // This branch call CUDA kernels - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< 
- T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto *running_mean_data = - running_mean->template data>(); - const auto *running_var_data = - running_var->template data>(); - - if (is_inplace) { - auto px = *x; - inplace_functor(data_layout, px.mutable_data(ctx.GetPlace()), - scale->template data>(), - bias->template data>(), - running_mean_data, running_var_data, epsilon, C, - H * W * D, num, x->data(), grid2, block, stream); - } - - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } - } - } -}; - -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
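In the use_global_stats branch above, the statistics are treated as constants, so KeBNBackwardData reduces to a per-channel rescaling dx = dy * scale / sqrt(running_var + eps). A scalar sketch of that simplification (float precision, illustrative only):

```cpp
#include <cmath>
#include <cstdio>

// With frozen (global) statistics the mean/variance carry no gradient, so the
// data gradient is just a per-channel rescaling of dy. Sketch only.
float bn_dx_global_stats(float dy, float scale, float running_var, float eps) {
  return dy * scale / std::sqrt(running_var + eps);
}

int main() {
  // 0.5 * 2.0 / sqrt(4.0 + 1e-5) is approximately 0.5
  std::printf("%f\n", bn_dx_global_stats(0.5f, 2.0f, 4.0f, 1e-5f));
  return 0;
}
```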
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - NormDoubleGradFunctor( - ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, - use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); - } -}; - } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 55bb57466c7b5ec4f4ac3c51b1cf84ab5098a0e9..bc9076f4d7c368f60187e9e432dd175d1f5ad45b 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, - PT_INFER_META(phi::BCELossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PD_INFER_META(phi::BCELossInferMeta)); REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 4774c0a1dbc3b78607d75efb7bc82d590ca4aa2a..9f6a78ab7a55f32558accd56e69d757003bad89c 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, BilinearTensorProductInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR( + PD_INFER_META(phi::BilinearTensorProductInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14bad4fdc342d8fba13c117bfad5bd65c..062e7d510d54c0f657582d48844093d94732971e 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af92877dff44d672597596036be0defbc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
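The pattern repeated across the operator files in this diff is the one visible above for bincount: drop the hand-written InferShape and register an infer-shape functor that forwards to a phi InferMeta function through the corrected DECLARE_INFER_SHAPE_FUNCTOR / PD_INFER_META macros. A self-contained sketch of the idea behind that adapter, with stand-in types (InferShapeContext, BincountInferMeta and friends here are local toys, not the real Paddle classes):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Shape inference is written once as a "meta" function over plain shapes; a
// thin functor adapts it to whatever context the op framework provides.
using Shape = std::vector<int64_t>;

// Toy stand-in for the phi InferMeta function.
Shape BincountInferMeta(const Shape& x, int minlength) {
  (void)x;          // the real meta also checks that x is 1-D
  (void)minlength;  // output length is data-dependent, so it stays dynamic
  return {-1};
}

// Toy stand-in for the generated functor: pull inputs/attrs from the context
// and forward to the meta function.
struct InferShapeContext {
  Shape x;
  int minlength;
  Shape out;
};

struct BincountInferShapeFunctor {
  void operator()(InferShapeContext* ctx) const {
    ctx->out = BincountInferMeta(ctx->x, ctx->minlength);
  }
};

int main() {
  InferShapeContext ctx{{8}, 0, {}};
  BincountInferShapeFunctor{}(&ctx);
  std::printf("out rank = %zu, dim0 = %lld\n", ctx.out.size(),
              static_cast<long long>(ctx.out[0]));
  return 0;
}
```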
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? 
weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4a1901b76b356c5e3274541dc0dd59..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
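Both the deleted CUDA path above and the CPU path that follows implement the same contract: the output length is max(input) + 1, but at least minlength, and each slot accumulates a count or the matching weight. A minimal serial sketch of that contract with a worked example (illustrative only, not the deleted kernel; input validation is omitted):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Serial bincount sketch: out[v] counts occurrences of v (or sums weights).
std::vector<double> bincount(const std::vector<int64_t>& x,
                             const std::vector<double>& weights,  // empty => counts
                             int64_t minlength) {
  int64_t size = x.empty() ? 0 : *std::max_element(x.begin(), x.end()) + 1;
  size = std::max(size, minlength);
  std::vector<double> out(static_cast<size_t>(size), 0.0);
  for (size_t i = 0; i < x.size(); ++i) {
    out[static_cast<size_t>(x[i])] += weights.empty() ? 1.0 : weights[i];
  }
  return out;
}

int main() {
  // x = [1, 2, 2, 5], minlength = 4  ->  [0, 1, 2, 0, 0, 1]
  for (double v : bincount({1, 2, 2, 5}, {}, 4)) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}
```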
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } - - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 27b1107675d4e722f9a2e25801ecc4dfb206cce5..1063a8b7992153dbedcdc0442ac3d8038c5e171b 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "broadcast_tensors"); - - int target_rank = 0; - const auto& input_dims = ctx->GetInputsDim("X"); - - // 1. Find Output rank = max(Inputs rank) - for (const auto& input_ddim : input_dims) { - target_rank = std::max(target_rank, input_ddim.size()); - } - - PADDLE_ENFORCE_GT( - target_rank, 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp requires at least one input tensor" - "to have rank greater than zero")); - - std::vector target_dims(target_rank, 0); - // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) - for (int index = 0; index < target_rank; index++) { - // Loop axes in reverse order, - // For each axis, take the maximum as target size - // Fill size = 1 if shape vector exhausts - int target_dim_size = 1; - for (const auto& input_ddim : input_dims) { - // Reversed order - int axis = static_cast(input_ddim.size()) - index - 1; - int dim_size = 1; - if (axis >= 0) { - dim_size = input_ddim[axis]; - } - - if (target_dim_size != 1 && dim_size != 1 && - target_dim_size != dim_size) { - PADDLE_THROW(platform::errors::InvalidArgument( - "BroadcastTensorsOp inputs does not satisfy bcast semantics," - "Please check axis = %d in reverse order", - index)); - } - - // We performed bcast semantics check at python level - // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); - } - target_dims[target_rank - index - 1] = target_dim_size; - } - - // 3. 
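The InferShape being deleted here encodes numpy-style broadcasting over a list of tensors: shapes are aligned from the trailing axis, each axis must agree or be 1, and the output takes the maximum; the same rule is what the newly registered phi::BroadcastTensorsInferMeta is expected to enforce. A compact sketch of that rule (hypothetical helper):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Numpy-style multi-input broadcast: align from the trailing axis; each axis
// size must be 1 or equal across inputs, and the result takes the maximum.
std::vector<int64_t> BroadcastShape(const std::vector<std::vector<int64_t>>& shapes) {
  size_t target_rank = 0;
  for (const auto& s : shapes) target_rank = std::max(target_rank, s.size());
  std::vector<int64_t> out(target_rank, 1);
  for (size_t i = 0; i < target_rank; ++i) {  // i counts from the back
    for (const auto& s : shapes) {
      if (i >= s.size()) continue;            // implicit size-1 axis
      int64_t d = s[s.size() - 1 - i];
      int64_t& o = out[target_rank - 1 - i];
      if (o != 1 && d != 1 && o != d)
        throw std::invalid_argument("incompatible broadcast dimensions");
      o = std::max(o, d);
    }
  }
  return out;
}

int main() {
  auto out = BroadcastShape({{3, 1, 4}, {1, 5, 4}, {4}});
  for (auto d : out) std::printf("%lld ", static_cast<long long>(d));  // 3 5 4
  std::printf("\n");
  return 0;
}
```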
Set Output Dim - std::vector output_ddims; - for (size_t i = 0; i < input_dims.size(); i++) { - output_ddims.emplace_back(phi::make_ddim(target_dims)); - } - ctx->SetOutputsDim("Out", output_ddims); - ctx->ShareAllLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, + BroadcastTensorsInferShapeFunctor, + PD_INFER_META(phi::BroadcastTensorsInferMeta)); + REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, ops::BroadcastTensorsGradOpMaker, ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference); + ops::BroadcastTensorsOpVarTypeInference, + BroadcastTensorsInferShapeFunctor); REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, ops::BroadcastTensorsGradOpVarTypeInference, ops::BroadcastTensorsGradNoNeedBufVarsInferer); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors_grad, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu deleted file mode 100644 index 5882258317d7daa6c62905f8a76d5c68060787a8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; - -template -class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const DDim& input_dims = input_tensor->dims(); - const DDim& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // Collect reduce_dims - // Example: - // dX = [1,1,1,1] - // dOut = [1,1,1,4] - // - // reduce_dims = [3] // reduce along the broadcasted axis - std::vector reduce_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // Turns out to be a No-Op, simply copy tensors - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - // reduce_sum implementation on CUDA - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), *input_tensor, output_tensor, - kps::IdentityFunctor(), reduce_dims_vec, stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h deleted file mode 100644 index 682f2e24769221d04317d0e53d02406c4c5a26eb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
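The gradient of a broadcast is a reduce-sum over exactly the axes that were created or expanded, which is what the reduce_dims_vec loop above collects: align dOut and dX from the trailing axis and reduce wherever dX has no matching axis or a differing extent. A compact sketch of that selection (hypothetical helper):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// For dX = reduce_sum(dOut), collect the axes of dOut that were created or
// expanded by broadcasting. Sketch only; mirrors the loop in the grad kernel.
std::vector<int> BroadcastGradReduceDims(const std::vector<int64_t>& dout_dims,
                                         const std::vector<int64_t>& dx_dims) {
  const int in_rank = static_cast<int>(dout_dims.size());
  const int out_rank = static_cast<int>(dx_dims.size());
  std::vector<int> reduce_dims;
  for (int j = 0; j < in_rank; ++j) {
    const int in_axis = in_rank - j - 1;
    const int out_axis = out_rank - j - 1;
    if (out_axis < 0 || dx_dims[out_axis] != dout_dims[in_axis]) {
      reduce_dims.push_back(in_axis);
    }
  }
  return reduce_dims;
}

int main() {
  // dOut: [2, 3, 4], dX: [3, 1]  ->  reduce over axes {2, 0} of dOut.
  for (int a : BroadcastGradReduceDims({2, 3, 4}, {3, 1})) std::printf("%d ", a);
  std::printf("\n");
  return 0;
}
```

If the collected list is empty, nothing was broadcast and the gradient is a plain copy, which is exactly the just_copy branch in the deleted kernels.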
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ - break; \ - } - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; -using framework::EigenTensor; - -template -class BroadcastTensorsOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto& in_tensors = context.MultiInput("X"); - auto out_tensors = context.MultiOutput("Out"); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // Eigen has no support for dynamic ranked tensor - // Thus we perform static expansion for each possible ranks - for (size_t i = 0; i < num_ins; i++) { - int out_rank = out_tensors[i]->dims().size(); - switch (out_rank) { - SWITCH_OUT_RANK_CASE(1) - SWITCH_OUT_RANK_CASE(2) - SWITCH_OUT_RANK_CASE(3) - SWITCH_OUT_RANK_CASE(4) - SWITCH_OUT_RANK_CASE(5) - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Target tensor rank out of range" - "Maximum supported rank for broadcast is: 5")); - } - } - } - } - - template - void ApplyBroadcast(const framework::ExecutionContext& context, - const Tensor* input_tensor, Tensor* output_tensor) const { - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // 1. Collect bcast_dims, each element of which indicates how many - // times we need to replicate along the corresponding dimension - // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for - // both input and output tensors, so we need to initialize input X with - // expanded dims: "new_input_dims_vec" - Eigen::DSizes bcast_dims; - std::vector new_input_dims_vec(out_rank); - for (int j = 0; j < out_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - bcast_dims[out_axis] = output_dims[out_axis]; - new_input_dims_vec[out_axis] = 1; - if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { - bcast_dims[out_axis] = 1; - new_input_dims_vec[out_axis] = input_dims[in_axis]; - } - } - auto new_input_dims = phi::make_ddim(new_input_dims_vec); - - // Initialize input X with new_input_dims_vec, so it's rank-aligned with the - // output - auto x = EigenTensor::From(*input_tensor, new_input_dims); - - output_tensor->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*output_tensor, output_dims); - - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, OutRank>::Eval(place, y, x, - bcast_dims); - } -}; - -#define SWITCH_RESHAPE_DIMS(n) \ - case n: { \ - Eigen::DSizes reshape_dims; \ - for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ - reshape_dims[i] = reshape_dims_vec[i]; \ - } \ - dX.device(place) = \ - dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ - break; \ - } - -#define UPPER_SWITCH_REDUCE_DIMS(m) \ - case m: { \ - Eigen::DSizes reduce_dims; \ - for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ - reduce_dims[i] = reduce_dims_vec[i]; \ - } \ - switch (reshape_size) { -#define LOWER_SWITCH_REDUCE_DIMS \ - default: { \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Detected reshape size: %d out of range" \ - "Minimum value should be larger than reduce size %d" \ - "While maximum supported is: 5", \ - reshape_size, reduce_size)); \ - } \ - } \ - break; \ - } - -/* ----- GradOpKernel ----- */ -template -class BroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes - // Here we perform the following Eigen operations: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - // Note the last "reshape(dX_shape)" will be performed implicitly, - // and we only need to collect reduce_dims and reshape_dims - std::vector reduce_dims_vec; - std::vector reshape_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 
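ApplyBroadcast above feeds Eigen a pair of vectors: per output axis, either keep the input extent with broadcast factor 1, or use extent 1 with the output size as the broadcast factor. A small sketch of that bookkeeping (hypothetical helper; shape compatibility is assumed to have been checked by InferShape):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Per output axis: if the input already matches, keep its extent and do not
// broadcast; otherwise pad the input with extent 1 and broadcast by the
// output extent. Sketch of the same arithmetic as ApplyBroadcast.
void BroadcastPlan(const std::vector<int64_t>& in_dims,
                   const std::vector<int64_t>& out_dims,
                   std::vector<int64_t>* bcast, std::vector<int64_t>* new_in) {
  const int in_rank = static_cast<int>(in_dims.size());
  const int out_rank = static_cast<int>(out_dims.size());
  bcast->assign(out_rank, 1);
  new_in->assign(out_rank, 1);
  for (int j = 0; j < out_rank; ++j) {
    const int out_axis = out_rank - j - 1;
    const int in_axis = in_rank - j - 1;
    if (in_axis >= 0 && in_dims[in_axis] == out_dims[out_axis]) {
      (*bcast)[out_axis] = 1;
      (*new_in)[out_axis] = in_dims[in_axis];
    } else {
      (*bcast)[out_axis] = out_dims[out_axis];
      (*new_in)[out_axis] = 1;
    }
  }
}

int main() {
  std::vector<int64_t> bcast, new_in;
  BroadcastPlan({3, 1}, {2, 3, 4}, &bcast, &new_in);
  // bcast = [2, 1, 4], new_in = [1, 3, 1]
  for (auto d : bcast) std::printf("%lld ", static_cast<long long>(d));
  std::printf("| ");
  for (auto d : new_in) std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");
  return 0;
}
```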
1; - int in_axis = in_rank - j - 1; - - reshape_dims_vec.push_back(input_dims[j]); - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - size_t reduce_size = reduce_dims_vec.size(); - size_t reshape_size = reshape_dims_vec.size(); - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // If this turns out to be a No-Op, simply perform a tensor copy - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(broadcast_tensors)" - " must be greater than or equal to 1, but " - "the value received is %d.", - reduce_dims_vec.size())); - PADDLE_ENFORCE_LE( - reduce_dims_vec.size(), 5, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' " - "for Op(broadcast_tensors) must be less than or equal " - "to 5, but the value received is %d.", - reduce_dims_vec.size())); - - // Overall: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - auto dX = framework::EigenVector::Flatten(*output_tensor); - auto dOut = framework::EigenVector::Flatten(*input_tensor); - auto& place = - *context.template device_context().eigen_device(); - - // Expand ReduceSize and ReshapeSize into static values - switch (reduce_size) { - UPPER_SWITCH_REDUCE_DIMS(1) - SWITCH_RESHAPE_DIMS(1) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(2) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(3) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(4) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(5) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Detected reduce size: %d out of range" - "While maximum supported is: 5", - reduce_size)); - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 5c7dd0e2561fa41313b2e65a443a9e4913a39961..eb51215790bbcdbc9e7d0c3adad482d9a69324b9 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext; ops::CastOpKernel>, \ ops::CastOpKernel>, ##__VA_ARGS__); -#if !defined(PADDLE_WITH_HIP) // See [ why register transfer_dtype_op alias with cast_op? 
] in cast_op.cc REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) -#else -REGISTER_CAST_CUDA_BASE(transfer_dtype) -#endif diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 09e915a6bafd4a8b72f35995b3ebbfeafa00476a..ed80ac076c0af7fc8922f095d4be4613bc5057ec 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, - PT_INFER_META(phi::CholeskyInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PD_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker, diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc index 6b5bae8fc73fe2b71212a93144d89144dd0268c6..5403e2440ee58f1cf7cbad107f4d3e174655ed3b 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cc +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { class CholeskySolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); - auto u_dims = context->GetInputDim("Y"); - auto b_dims = context->GetInputDim("X"); - int u_rank = u_dims.size(); - int b_rank = b_dims.size(); - PADDLE_ENFORCE_GE(u_rank, 2, - platform::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(b_rank, 2, - platform::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - u_dims[u_rank - 1], u_dims[u_rank - 2])); - PADDLE_ENFORCE_EQ( - b_dims[b_rank - 2], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "the first dim of input X must equal to the dim of input Y," - "But Got %ld and %ld", - b_dims[b_rank - 2], u_dims[u_rank - 2])); - - std::vector u_dims_vec = phi::vectorize(u_dims); - std::vector b_dims_vec = phi::vectorize(b_dims); - - std::vector u_dims_vec_cut(u_dims_vec.begin(), - u_dims_vec.end() - 2); - std::vector b_dims_vec_cut(b_dims_vec.begin(), - b_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); - - std::vector b_broadcast_dims({expand_batch_portion}); - b_broadcast_dims.insert(b_broadcast_dims.end(), - {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); - - // dim of 
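The shape rule removed here, and now expected from phi::CholeskySolveInferMeta, is: both inputs are stacks of matrices, Y must be square, X must have matching rows, the leading batch dims broadcast, and Out keeps X's trailing matrix shape. A compact sketch of that rule (hypothetical helper):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Out shape for cholesky_solve-style ops: broadcast the batch dims of X and Y,
// then append X's trailing [rows, cols]. Sketch only.
std::vector<int64_t> CholeskySolveOutShape(const std::vector<int64_t>& x_dims,
                                           const std::vector<int64_t>& y_dims) {
  const size_t xr = x_dims.size(), yr = y_dims.size();
  if (xr < 2 || yr < 2) throw std::invalid_argument("ranks must be >= 2");
  if (y_dims[yr - 1] != y_dims[yr - 2])
    throw std::invalid_argument("Y must be a (batch of) square matrix");
  if (x_dims[xr - 2] != y_dims[yr - 2])
    throw std::invalid_argument("rows of X must match the order of Y");

  // Broadcast the batch portions (everything but the last two dims).
  std::vector<int64_t> xb(x_dims.begin(), x_dims.end() - 2);
  std::vector<int64_t> yb(y_dims.begin(), y_dims.end() - 2);
  const size_t rank = std::max(xb.size(), yb.size());
  std::vector<int64_t> batch(rank, 1);
  for (size_t i = 0; i < rank; ++i) {
    int64_t a = i < xb.size() ? xb[xb.size() - 1 - i] : 1;
    int64_t b = i < yb.size() ? yb[yb.size() - 1 - i] : 1;
    if (a != b && a != 1 && b != 1)
      throw std::invalid_argument("batch dims are not broadcastable");
    batch[rank - 1 - i] = std::max(a, b);
  }
  batch.push_back(x_dims[xr - 2]);
  batch.push_back(x_dims[xr - 1]);
  return batch;
}

int main() {
  // X: [2, 1, 3, 4], Y: [5, 3, 3]  ->  Out: [2, 5, 3, 4]
  for (auto d : CholeskySolveOutShape({2, 1, 3, 4}, {5, 3, 3}))
    std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");
  return 0;
}
```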
'Out' is the same with 'Y' after broadcast - context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor, + PD_INFER_META(phi::CholeskySolveInferMeta)); + REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, ops::CholeskySolveOpMaker, ops::CholeskySolveOpVarTypeInference, ops::CholeskySolveOpGradMaker, - ops::CholeskySolveOpGradMaker); + ops::CholeskySolveOpGradMaker, + CholeskySolveInferShapeFunctor); REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); -// Complex<> is not supported because of TensorExpand, which used to boardcast -// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu deleted file mode 100644 index 1b551a7cd0343db32a84e962212a25e1ff5a4893..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, - int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, - int *devInfo); - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, float *Adata, - int lda, float *Bdata, int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, - double *Adata, int lda, double *Bdata, int ldb, - int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( - cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( - cusolverH, uplo, n, nrhs, - reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - - /* step 1: get cusolver handle*/ - auto cusolverH = dev_ctx.cusolver_dn_handle(); - - /* step 2: solve A0*X0 = B0 */ - cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - } -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor &in, Tensor *out, - const framework::ExecutionContext &ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CUDA_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h deleted file mode 100644 index f25fbbb0c698036951c4b9ae8e9ad2778786a1a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ /dev/null @@ -1,248 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" - -namespace paddle { -namespace operators { // namespace operators - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); -}; - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - char uplo = upper ? 
'U' : 'L'; - phi::funcs::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - } -}; - -template -void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, - const framework::Tensor &uin, - const framework::Tensor &bin, framework::Tensor *out, - bool upper) { - const auto &dev_ctx = ctx.template device_context(); - // framework::Tensor broadcast - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); - framework::Tensor u_bst(uin.type()); - TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); - - framework::Tensor b_bst(bin.type()); - TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - - math::DeviceIndependenceTensorOperations helper(ctx); - - // calculate u's conjugate for complex - framework::Tensor u_conj(u_bst.type()); - platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - phi::funcs::ConjFunctor u_functor( - u_bst.data(), u_bst.numel(), - u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); - u_for_range(u_functor); - u_conj = helper.Transpose(u_conj); - - // calculate b's conjugate for complex - framework::Tensor b_conj(b_bst.type()); - platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - phi::funcs::ConjFunctor b_functor( - b_bst.data(), b_bst.numel(), - b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); - b_for_range(b_functor); - b_conj = helper.Transpose(b_conj); - - auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); - auto uindims = u_bst.dims(); - auto bindims = b_bst.dims(); - int uinrank = uindims.size(); - int binrank = bindims.size(); - - int n = uindims[uinrank - 2]; - int nrhs = bindims[binrank - 1]; - int ldab = std::max(1, n); - - // framework::Tensor out_copy(b_conj.type()); - // out_copy.Resize(b_conj.dims()); - framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - - auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2); - auto batchsize = product(info_dims); - - framework::Tensor tmp; - std::vector tmpdim(1, batchsize); - tmp.Resize(phi::make_ddim(tmpdim)); - int *info = tmp.mutable_data(dev_ctx.GetPlace()); - - CholeskySolveFunctor functor; - for (int b = 0; b < batchsize; b++) { - auto uin_data_item = &ut_data[b * n * n]; - auto out_data_item = &out_data[b * n * nrhs]; - auto info_item = &info[b]; - functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, - info_item); - } - - // calculate out's conjugate for complex - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out->mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - *out = helper.Transpose(*out); -} - -template -class CholeskySolveKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *uin = ctx.Input("Y"); - auto *bin = ctx.Input("X"); - auto *out = ctx.Output("Out"); - auto upper = ctx.Attr("upper"); - cholesky_solve_fn(ctx, *uin, *bin, out, upper); - } -}; - -template -class CholeskySolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *bin = ctx.Input("X"); - auto *uin = ctx.Input("Y"); - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *db = ctx.Output(framework::GradVarName("X")); - auto *du = ctx.Output(framework::GradVarName("Y")); - auto upper = ctx.Attr("upper"); - - const auto 
&dev_ctx = ctx.template device_context(); - math::DeviceIndependenceTensorOperations helper(ctx); - - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); - framework::Tensor u_bst(uin->type()); - TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); - - framework::Tensor db_bst(bin->type()); - TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); - - if (dout) { - db->mutable_data(dev_ctx.GetPlace()); - cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); - - if (db_bst.dims() == db->dims()) { - framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); - } else { - MatrixReduceSumFunctor functor; - functor(db_bst, db, ctx); - db->Resize(bin->dims()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // calculate out's conjugate for complex - framework::Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - out_conj = helper.Transpose(out_conj); - - framework::Tensor commonterm(out->type()); - auto outdims = out_conj.dims(); - auto dbdims = db_bst.dims(); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false); - auto cmtdim = outdims; - cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; - commonterm.Resize(cmtdim); - commonterm.mutable_data(dev_ctx.GetPlace()); - blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), - &commonterm, static_cast(0)); - - // calculate commonterm's conjugate for complex - framework::Tensor commonterm_conj(commonterm.type()); - platform::ForRange commonterm_for_range( - dev_ctx, commonterm.numel()); - phi::funcs::ConjFunctor commonterm_functor( - commonterm.data(), commonterm.numel(), - commonterm_conj.mutable_data(commonterm.dims(), - dev_ctx.GetPlace())); - commonterm_for_range(commonterm_functor); - commonterm_conj = helper.Transpose(commonterm_conj); - - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - commonterm, commonterm_conj, -1, &commonterm); - - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false); - - Tensor du_bst(uin->type()); - // get upper or lower triangular - du_bst.Resize(u_bst.dims()); - du_bst.mutable_data(dev_ctx.GetPlace()); - if (upper) { - blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), - &du_bst, static_cast(0)); - } else { - blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), - &du_bst, static_cast(0)); - } - - const auto &udims = u_bst.dims(); - const auto H = udims[udims.size() - 2]; - const auto W = udims[udims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, u_bst.numel()); - TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, - u_bst.data()); - x_for_range(tril_triu_computer); - - du->mutable_data(dev_ctx.GetPlace()); - if (u_bst.dims() == du->dims()) { - framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); - } else { - MatrixReduceSumFunctor functor; - functor(u_bst, du, ctx); - du->Resize(uin->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index f1247ebdf23c8e00cdbfd662a160912a769d7558..2092f65212a6a71534e1ea9a6977abc94bf97b6a 100644 
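The deleted MatrixReduceSumFunctor above sums a broadcast gradient back to its original shape; its in-code example (in dims [5, 3, 2, 7, 3], out dims [3, 1, 7, 3], reduce over axes [0, 2]) is worth spelling out. A standalone sketch of the axis selection, with element types assumed since the extraction lost them:

#include <algorithm>
#include <cstdint>
#include <vector>

// Pick the batch axes that were broadcast and therefore must be summed away.
std::vector<int> GetReduceDims(const std::vector<int64_t>& in_dims,
                               const std::vector<int64_t>& out_dims) {
  const size_t in_size = in_dims.size();
  const size_t out_size = out_dims.size();
  // Right-align out_dims against in_dims by padding leading 1s.
  std::vector<int64_t> out_bst(in_size, 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst.begin() + (in_size - out_size));
  std::vector<int> reduce_dims;
  // Only batch dimensions are scanned; the trailing two matrix dimensions
  // stay, matching the original loop bound (idx <= in_size - 3).
  for (size_t i = 0; i + 2 < in_size; ++i) {
    if (in_dims[i] != 1 && out_bst[i] == 1) {
      reduce_dims.push_back(static_cast<int>(i));
    }
  }
  return reduce_dims;
}

// E.g. GetReduceDims({5, 3, 2, 7, 3}, {3, 1, 7, 3}) returns {0, 2}.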
--- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,9 +1,9 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor cinn) -SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) @@ -11,7 +11,7 @@ if (WITH_TESTING) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags) set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 7c4bdc09a569e455b20febef278003ada923dd79..0edbee534c0b5d680717250e7702f272eacd0272 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -22,11 +22,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(cinn_launch); USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +#endif + namespace paddle::operators { using framework::paddle2cinn::CinnCompiler; @@ -50,7 +56,7 @@ TEST(CinnInstructionOpTest, TestWithElementwiseAdd) { auto cinn_instruction_run_op = paddle::framework::OpRegistry::CreateOp( "cinn_instruction_run", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"cached_index", 0}, {"instruction_index", 1}}); + {{"cached_index", 0}, {"instruction_index", 0}}); auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {add_op_out_name}}}, {{}}); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0a21d937aa1a70120e6112cdb291aa41eb222bb3..b76dd60409221eef9204f26319dabb20db4a36ac 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" @@ -90,9 +91,30 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // Convert the CINN runtime program to a Paddle graph runtime_graph_ = std::make_unique( BuildCompiledProgram(graph, compiled_obj)); - runtime_graph_->SetNotOwned( - kMemOptVarInfoFromMainGraph, - &graph.Get(kMemOptVarInfoFromMainGraph)); + auto& outer_varinfo = graph.Get(kMemOptVarInfoFromMainGraph); + runtime_graph_->SetNotOwned(kMemOptVarInfoFromMainGraph, + &outer_varinfo); + // collect skip_eager_vars + skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); + auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // if a var exists at outer_varinfo map, + // that means it can be erased after graph execution + if (!outer_varinfo.count(var_name)) { + skip_eager_vars_.emplace_back(var_name); + } + }; + std::for_each(input_var_names.begin(), input_var_names.end(), + add_skip_var_fn); + std::for_each(output_var_names.begin(), output_var_names.end(), + add_skip_var_fn); + VLOG(4) << string::Sprintf( + "Distribution of variables in the graph compiled:" + "input[%lu],internal[%lu],output[%lu]," + "outer_eager_deletion[%lu],skip_eager_deletion[%lu]," + "initialized_beforehand[%lu]", + input_var_names.size(), internal_var_names_.size(), + output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(), + initialized_beforehand_vars_.size()); } void CinnLaunchContext::BuildVarNameMap( @@ -288,6 +310,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( // are set by values of the corresponding compiled tensors, // including the in/out variables where the equiality between their tensors // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
+ std::unordered_set has_refer_vars; for (auto&& arg : cinn_argument_names_) { const std::string& var_name = cinn2paddle_varmap_.at(arg); framework::VarDesc* var_desc = block->Var(var_name); @@ -298,6 +321,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ori_desc = res->second; var_desc->SetPersistable(ori_desc->Persistable()); var_desc->SetIsParameter(ori_desc->IsParameter()); + has_refer_vars.insert(var_name); } auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -331,6 +355,12 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ins = instructions.at(ins_idx).get(); auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + for (auto&& var_name : in_args) { + if (!has_refer_vars.count(var_name)) { + initialized_beforehand_vars_.emplace_back(var_name); + } + } + has_refer_vars.insert(out_args.begin(), out_args.end()); auto* op_desc = block->AppendOp(); op_desc->SetType("cinn_instruction_run"); @@ -348,16 +378,26 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, framework::Scope* scope) { if (!parallel_executor_) { framework::details::ExecutionStrategy exec_strategy; + exec_strategy.num_threads_ = 1; + exec_strategy.use_device_ = platform::Place2DeviceType(place); framework::details::BuildStrategy build_strategy; parallel_executor_ = std::make_unique( place, scope, exec_strategy, build_strategy, runtime_graph_.get()); } // update the scope bound to an OpHandle and rebuild temporary variables + VLOG(4) << "Reset scope and initialize temporary variables"; std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); parallel_executor_->PrepareVariables(scope); + for (auto&& var_name : initialized_beforehand_vars_) { + auto* var = scope->GetVar(var_name); + auto* buffer = GetCinnBufferOfVar(var_name); + auto dim = framework::DDim(buffer->dims, buffer->dimensions); + var->GetMutable()->Resize(dim); + var->GetMutable()->mutable_data(place); + } return parallel_executor_.get(); } diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index a4d613ea618a886d99344a34ad80aa02e88c10e7..ed5e4383d83d23322860e3f554160013fd5532c9 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -86,6 +86,11 @@ class CinnLaunchContext { void CheckTensorEquivalent(const std::string& var_name, const framework::LoDTensor& paddle_tensor); + // Return the name list of variables skipped eager deletion + const std::vector& GetSkipEagerVars() const { + return skip_eager_vars_; + } + // Return internal variable names list const std::unordered_set& GetInternalVarNames() const { return internal_var_names_; @@ -143,6 +148,9 @@ class CinnLaunchContext { std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; + // TODO(CtfGo): remove this list after fixing batch_norm bug + // due to duplicate association in the same variable. 
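The initialized_beforehand_vars_ list introduced above records instruction inputs that no earlier instruction produces; InitializePE then sizes and allocates their tensors from the matching cinn_buffer_t before the run. The extracted hunk lost the template arguments of GetMutable and mutable_data; a reconstruction of that loop with framework::LoDTensor and float assumed for them:

// Reconstruction of the pre-initialization loop in InitializePE; the
// GetMutable<...> and mutable_data<...> arguments are assumptions, since the
// extraction stripped them.
for (auto&& var_name : initialized_beforehand_vars_) {
  auto* var = scope->GetVar(var_name);
  auto* buffer = GetCinnBufferOfVar(var_name);
  auto dim = framework::DDim(buffer->dims, buffer->dimensions);
  var->GetMutable<framework::LoDTensor>()->Resize(dim);
  var->GetMutable<framework::LoDTensor>()->mutable_data<float>(place);
}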
+ std::vector initialized_beforehand_vars_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; @@ -150,6 +158,8 @@ class CinnLaunchContext { std::unique_ptr runtime_graph_; // a ParallelExecutor to execute the runtime graph std::unique_ptr parallel_executor_; + // the name list of skip_eager_vars in runtime + std::vector skip_eager_vars_; // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index cf3b98c6679b80acad8da69c91addadb9f66ce44..5263aae03ed3f1ab6afa4eb9e6bd38f61858b397 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -103,8 +103,8 @@ class CinnLaunchOpKernel : public framework::OpKernel { details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. Prepare arguments needed for the compiled executable program. - launch_context->UpdateCapturedEnv(scope, place); + // Step 3. check the computational consistency of the subgraph + // before and after the compilation // 3.1 Input variables: tensors of input variables have // been initialized before graph compiled, just check the // equiality between tensors of paddle and cinn. @@ -120,20 +120,15 @@ class CinnLaunchOpKernel : public framework::OpKernel { *inputs_name2tensor.at(var_name)); } - // 3.2 Output variables: the output variables will be initialized - // and allocated buffer in callbacks which are defined in the - // external_malloc/free interface of cinn_buffer_t - // in their corresponding arguments. - // 3.3 Internal variables: A temporary scope is created in - // UpdateCapturedEnv to keep the internal variables and - // they are also initialized through callbacks - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. Launch CINN to execute the compiled executable program - VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; - details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + // Step 5. use PE to execute the compiled CINN instructions + // in nodes of the runtime graph + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index f5b6161ff3462cc1f12df7f59b4709bf19032df2..585f1caabed051134fd5ce7624c17b741b487ef0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" @@ -25,9 +26,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(cinn_launch); +USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +DECLARE_double(eager_delete_tensor_gb); + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +#endif namespace paddle::operators { @@ -61,6 +70,7 @@ TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); }; + FLAGS_eager_delete_tensor_gb = -1; // CPU run_and_check_fn(platform::CPUPlace()); diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f29bc57c9a5f4dbbfd53220ce187b386b3025e55 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif +#include "paddle/fluid/framework/convert_utils.h" + +namespace paddle { +namespace operators { + +template +class CAllGatherOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); + + int nranks = ctx.Attr("nranks"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::DDim out_dims = x->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint32_t send_numel = x->numel(); + void* send_buff = reinterpret_cast(const_cast(x->data())); + void* recv_buff = reinterpret_cast(out->data()); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel, + dtype, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + 
ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index c0968581acda9950aaa8ee2b8f3af15e1db59a67..7206dd01bcaa3e588cc275c2fdf25e70aacc1663 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 31b00a93f1396564907a7872e919ba6c96f666d8..0946ad8aca65e28835ea1d139fb94c309ce840a1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 7e5120cd2b392b1eb0698727ccebac485193f6d9..2c4e85400ca4adadce5db1fd318ce2273caa201f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); out->Resize(in->dims()); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 9c11704704ed420b14a6ccd9873e0bfbe143b4fe..61e5f27903477972ef10465ccfd6f8de8ce8fba6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
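The test updates in this diff follow one adaptation pattern for the phi kernel migration: operators whose kernels left fluid are pulled in with USE_OP_ITSELF, the phi kernels they rely on are declared explicitly, stale includes of removed headers (dropout_op.h in the NPU tests here) are dropped, and the CINN launch test pins FLAGS_eager_delete_tensor_gb so buffers survive the run. A minimal sketch of the declaration block, taken from the cinn test changes above:

// What a test translation unit declares once elementwise_add's kernels live
// in phi rather than fluid (copied from the cinn tests in this diff).
USE_OP(cinn_launch);
USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);

PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_CUDA
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
#endif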
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d315f211709e4f76c2d5c685721961a91c2102fe..d1e269fb5a4fe9505acf7043bc7a2cea36823ffa 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto out = ctx.Output("Out"); int numel = x->numel(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 5787090e6a52f2f37bd504a904108cd1d24caf5f..cf4d6a28744b368212fe8bcb0924001aa53b5a4e 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c79b2f92b69a1e6cc5c6f1cf17fa402c671a1997..c4e410d04da5fb5e9b6bfe4d7d5c263084889f54 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index d9a7a4abb08fc883b9b9210fcdefd56af127263a..8b498787c69db0f978acaa68ba63883270e11eb4 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index b8abf458c1c6d395fef08238abaa114ff5dc6e9e..133085ad3f3b0ffd00dbf4d026687b0311116951 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index bb78971734bf05e94f7b0ebc1f1540b254f98067..36c6f4fadd0fcc9b06c61d5c45ce6829f2d3d977 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -27,7 +27,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 8f7b8c4a9040be3a2b4540c693c128e92c06a180..6e02d362156970cdee7257c7d00b70cef0519757 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index c40b2c3e76a02ce6e5e754b2dc4280d6917145e7..57e3dd53cc7748fa0fb66e7e934a1c9cd764a15f 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -25,7 +25,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 55de4087f579460fa6080733f3e2f02bb082b015..059fafa3e7f4d4ff0dac7541038d62e03865529f 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" #ifdef PADDLE_WITH_MKLDNN @@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(0), - platform::errors::InvalidArgument( - "The number of input tensors in concat op should > 0. But " - "received inputs' length is 0.")); - if (inputs_num == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = - phi::make_ddim(std::vector(inputs_dims[0].size(), -1)); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PD_INFER_META(phi::ConcatInferMeta)); + REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, - ops::ConcatGradOpMaker); + ops::ConcatGradOpMaker, + ConcatInferShapeFunctor); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, ops::ConcatDoubleGradOpMaker, ops::ConcatDoubleGradOpMaker, diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 95135ba3b1a3db156cd80629296481470b11f937..cbec1182f20b886fb4a77847abf7213aec9990a5 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, ops::ConjGradMaker, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7ba34f67ecb7c2ade002fcb4475229f..0c18522fa32eae5f357da062fbd25fa92878cc08 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -19,6 +19,6 @@ else() target_link_libraries(conditional_block_infer_op conditional_block_op) endif() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") -file(APPEND ${pybind_file} 
"USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc index 55cab03ea9e3f18f36043848914ac11fac1027c9..4dcbbc8568ff18a1313171f8f66f276d77f019a1 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cc +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/bitwise_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -75,11 +75,19 @@ It operates ``%s`` on Tensor ``X`` . } }; -class BitwiseOp : public framework::OperatorWithKernel { +template +class UnaryBitwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: + void InferShape(framework::InferShapeContext *context) const override { + OpComment comment; + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -90,23 +98,9 @@ class BitwiseOp : public framework::OperatorWithKernel { }; template -class UnaryBitwiseOp : public BitwiseOp { - public: - using BitwiseOp::BitwiseOp; - - protected: - void InferShape(framework::InferShapeContext *context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - context->SetOutputDim("Out", context->GetInputDim("X")); - context->ShareLoD("X", "Out"); - } -}; - -template -class BinaryBitwiseOp : public BitwiseOp { +class BinaryBitwiseOp : public framework::OperatorWithKernel { public: - using BitwiseOp::BitwiseOp; + using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext *context) const override { @@ -130,6 +124,14 @@ class BinaryBitwiseOp : public BitwiseOp { } context->ShareLoD("X", "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // BitwiseOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } }; } // namespace operators @@ -167,8 +169,3 @@ REGISTER_BINARY_BITWISE_OP(bitwise_and, "Out = X \\& Y"); REGISTER_BINARY_BITWISE_OP(bitwise_or, "Out = X | Y"); REGISTER_BINARY_BITWISE_OP(bitwise_xor, "Out = X ^\\wedge Y"); REGISTER_UNARY_BITWISE_OP(bitwise_not, "Out = \\sim X"); - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CPU, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CPU, 
ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CPU, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CPU, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu deleted file mode 100644 index 5d98da2c027fb6ee681bbea3980f1dbf631d6431..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/bitwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x, y}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, -1, - functor); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h deleted file mode 100644 index 9e652f92007479684fcf8ec5e539312d8d729107..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool a, const bool b) const { \ - return a bool_expr b; \ - } \ - }; - -BITWISE_BINARY_FUNCTOR(And, &, &&) -BITWISE_BINARY_FUNCTOR(Or, |, ||) -BITWISE_BINARY_FUNCTOR(Xor, ^, !=) -#undef BITWISE_BINARY_FUNCTOR - -template -struct BitwiseNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T a) const { return ~a; } -}; - -template <> -struct BitwiseNotFunctor { - using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool a) const { return !a; } -}; - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - ElementwiseComputeEx(context, x, y, -1, func, - out); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), out->mutable_data(context.GetPlace()), - func); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -#define REGISTER_BINARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>); - -#define REGISTER_UNARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index ede349f737d899e5f04cb5e35d1dbc0c0abc2403..dd407f4f6f3c51ef99cb09f08ef7fdca5b1e10bc 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -12,49 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
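The deleted bitwise_op.h above generated its element-wise functors through a macro; the extraction stripped the template headers. An expanded reconstruction of one binary functor plus the not-functor, with the presumed template syntax restored (bool specializes to the logical operators so the ops stay meaningful on boolean tensors; HOSTDEVICE is Paddle's host/device qualifier):

// Reconstruction of BitwiseAndFunctor / BitwiseNotFunctor from the deleted
// header; the template parameters are restored as an assumption.
template <typename T>
struct BitwiseAndFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE T operator()(const T a, const T b) const { return a & b; }
};

template <>
struct BitwiseAndFunctor<bool> {
  using ELEM_TYPE = bool;
  HOSTDEVICE bool operator()(const bool a, const bool b) const {
    return a && b;
  }
};

template <typename T>
struct BitwiseNotFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE T operator()(const T a) const { return ~a; }
};

template <>
struct BitwiseNotFunctor<bool> {
  using ELEM_TYPE = bool;
  HOSTDEVICE bool operator()(const bool a) const { return !a; }
};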
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - Tensor tmp; - bool* z_data = z->mutable_data(context.GetPlace()); - - if (x->dims() != y->dims()) { - z_data[0] = false; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = tmp.mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, 0, Functor(), &tmp); - } - auto ipt = framework::EigenVector::Flatten(tmp); - auto out = framework::EigenScalar::From(*z); - auto& place = - *context.template device_context() - .eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - out.device(place) = ipt.all(reduce_dim); - } - } -}; - template class CompareReduceOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -81,26 +46,6 @@ template class CompareReduceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - PADDLE_ENFORCE_EQ(context->HasInput("X"), true, - platform::errors::InvalidArgument( - "%s operator must have input X", comment.type)); - PADDLE_ENFORCE_EQ(context->HasInput("Y"), true, - platform::errors::InvalidArgument( - "%s operator must have input Y", comment.type)); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_GE( - dim_x.size(), dim_y.size(), - platform::errors::InvalidArgument( - "The size of dim_y should not be greater than dim_x's.")); - - context->SetOutputDim("Out", {1}); - context->ShareLoD("X", "Out"); - } }; } // namespace operators @@ -113,25 +58,13 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); -#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - 
::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>); REGISTER_COMPARE_REDUCE_OP(equal_all, "X == Y"); - -REGISTER_COMPARE_REDUCE_CPU_KERNEL(equal_all, - paddle::operators::EqualReduceFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu deleted file mode 100644 index d96dcebe51f97f1a3a954966aeb3663ff1f7a819..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -template -struct BitwiseAdd { - // Bitwise add operator, returns a + b - inline T initial() { return static_cast(true); } - - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return a & b; - } -}; - -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - bool* z_data = z->mutable_data(context.GetPlace()); - Tensor tmp; - - if (x->dims() != y->dims()) { - thrust::device_ptr z_dev_ptr(z_data); - thrust::fill(z_dev_ptr, z_dev_ptr + 1, false); - return; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - const auto& cuda_ctx = - context.template device_context(); - std::vector ins = {x, y}; - std::vector outs = {&tmp}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, Functor()); - - // Reduce by 'bitwise and' operator - std::vector reduce_dims; - reduce_dims.resize(tmp.dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), - reduce_dims, stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>); - -REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, EqualReduceFunctor) -#undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h deleted file mode 100644 index 
78a7b76e3fd9d03f2381dfb13f90c191d1dca4f8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -struct EqualReduceFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T a, const T b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } else { - return (a == b); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 657e74398bb24bb4c2a5514bbb1656126591ee4e..72d81d8c3fdf2827da9b8362cee80ecbb16e4484 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
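The deleted compare_all_op.h defined the per-element predicate behind equal_all; with the stripped template syntax restored as an assumption it reads roughly as below. Floating-point elements compare with an absolute tolerance of 1e-8, everything else with operator== (the branch is resolved at compile time for integral T):

// Reconstruction of the deleted EqualReduceFunctor (HOSTDEVICE is Paddle's
// host/device qualifier; the template parameters are assumed).
template <typename T>
struct EqualReduceFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE bool operator()(const T a, const T b) const {
    if (std::is_floating_point<T>::value) {
      // Widening to double is safe here; the branch folds away for integers.
      return fabs(static_cast<double>(a - b)) < 1e-8;
    } else {
      return a == b;
    }
  }
};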
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -60,31 +58,6 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - - if (context->GetInputDim("X") == context->GetInputDim("Y")) { - context->ShareDim("X", /*->*/ "Out"); - context->ShareLoD("X", /*->*/ "Out"); - } else { - int max_dim = std::max(dim_x.size(), dim_y.size()); - int axis = std::abs(dim_x.size() - dim_y.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), - max_dim, axis); - context->SetOutputDim("Out", phi::make_ddim(out_dims_array)); - // to do - context->ShareLoD("X", /*->*/ "Out"); - } - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -116,37 +89,31 @@ class CompareOp : public framework::OperatorWithKernel { "In order to force fill output variable to gpu memory.", \ false)); -#define REGISTER_COMPARE_OP(op_type, _equation) \ - struct _##op_type##Comment { \ - static char type[]; \ - static char equation[]; \ - }; \ - char _##op_type##Comment::type[]{#op_type}; \ - char _##op_type##Comment::equation[]{_equation}; \ - REGISTER_OPERATOR( \ - op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ - ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ - ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); \ +#define REGISTER_COMPARE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareInferMeta)); \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); \ REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterThanFunctor); + REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterEqualFunctor); + REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); -REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor, - 
paddle::operators::LessThanFunctor); + REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); -REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessEqualFunctor); + REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); + REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu deleted file mode 100644 index 4b9452d0f60e0396e4bc50bb5ea56e2f3131098e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -class CompareOpKernel - : public framework::OpKernel { - public: - using InT = typename Functor::ELEM_TYPE; - using OutT = bool; - void Compute(const framework::ExecutionContext& ctx) const override { - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>); - -REGISTER_CUDA_COMPARE_KERNEL(equal, EqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(not_equal, NotEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_than, LessThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_equal, LessEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_than, GreaterThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_equal, GreaterEqualFunctor) -#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h deleted file mode 100644 index be017a01ef3237fd8572e248d691daa97c999509..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
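The REGISTER_COMPARE_OP macro introduced above is hard to follow once the patch is flattened. As a rough guide, its expansion for a single operator looks like the sketch below; the angle-bracket template arguments were dropped from this copy of the patch, so the EmptyGradOpMaker arguments shown are the conventional Paddle ones and should be read as an assumption, not verbatim source.

struct _less_thanComment {
  static char type[];
  static char equation[];
};
char _less_thanComment::type[]{"less_than"};
char _less_thanComment::equation[]{"Out = X < Y"};

// Shape inference is delegated to phi::CompareInferMeta instead of the removed
// hand-written CompareOp::InferShape.
DECLARE_INFER_SHAPE_FUNCTOR(less_than, less_than_InferShapeFunctor,
                            PD_INFER_META(phi::CompareInferMeta));

REGISTER_OPERATOR(
    less_than, ::paddle::operators::CompareOp<_less_thanComment>,
    ::paddle::operators::CompareOpProtoMaker<_less_thanComment>,
    ::paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,   // assumed
    ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,  // assumed
    less_than_InferShapeFunctor);
REGISTER_COMPARE_OP_VERSION(less_than);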
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define COMPARE_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEM_TYPE = InT; \ - HOSTDEVICE OutT operator()(const InT a, const InT b) const { \ - return static_cast(a op b); \ - } \ - }; - -COMPARE_FUNCTOR(LessThanFunctor, <) -COMPARE_FUNCTOR(LessEqualFunctor, <=) -COMPARE_FUNCTOR(GreaterThanFunctor, >) -COMPARE_FUNCTOR(GreaterEqualFunctor, >=) -#undef COMPARE_FUNCTOR - -template -struct EqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE OutT operator()(const InT a, const InT b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return static_cast(fabs(static_cast(a - b)) < 1e-8); - } else { - return static_cast(a == b); - } - } -}; - -template -struct NotEqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE bool operator()(const InT a, const InT b) const { - return !EqualFunctor()(a, b); - } -}; - -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); - } else { - ElementwiseComputeEx( - context, x, y, axis, InverseFunctor(), z); - } - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ - REGISTER_OP_##dev##_KERNEL(op_type, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dc287ab76a67c6026ec8794793e77179063af3d --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
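The functor definitions in the deleted compare_op.h above (and the EqualReduceFunctor in compare_all_op.h) lost their template parameter lists in this copy of the patch. The self-contained sketch below restores the equality functor with assumed parameter names (InT/OutT) and drops the HOSTDEVICE qualifier so it compiles on its own; the point to keep is that floating-point equality is an absolute-tolerance test (|a - b| < 1e-8) while integral types compare exactly.

#include <cmath>
#include <cstdio>
#include <type_traits>

template <typename InT, typename OutT = bool>  // parameter names assumed
struct EqualFunctor {
  using ELEM_TYPE = InT;
  OutT operator()(const InT a, const InT b) const {
    if (std::is_floating_point<InT>::value) {
      // Branch is compiled away for integral InT; casting to double keeps the
      // subtraction safe for the 1e-8 absolute tolerance.
      return static_cast<OutT>(std::fabs(static_cast<double>(a - b)) < 1e-8);
    }
    return static_cast<OutT>(a == b);
  }
};

int main() {
  EqualFunctor<float> feq;
  EqualFunctor<int> ieq;
  std::printf("%d %d %d\n",
              static_cast<int>(feq(1.0f, 1.0f + 1e-9f)),  // 1: within tolerance
              static_cast<int>(feq(1.0f, 1.5f)),          // 0
              static_cast<int>(ieq(3, 3)));                // 1
  return 0;
}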
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class EqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class NotEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + 
auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + equal, ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + not_equal, ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_than, ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_equal, ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_than, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_equal, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index 7bc4ca09771355361d8106421dc57601b94c88f1..7377d7cf8d312c4f4f405235b21b372b1a7a738c 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc index 698bd0516133861523f8d2b353abfeace4665840..2de8b4c9ba880e089bb4eaa4fa8df3bedb69b55b 100644 --- a/paddle/fluid/operators/controlflow/compare_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index a4262d405435ae31c2a5ad681ab443889ec5d393..4d11cb5ff74e69e991271d2a566dbc9344d35da2 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp { ::paddle::framework::EmptyGradOpMaker); REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, - paddle::operators::LogicalAndFunctor); REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, - paddle::operators::LogicalOrFunctor); REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, - paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, - paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu deleted file mode 100644 index d88658607ed275808d64dddf4a60d52d4f995e73..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/logical_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using InT = typename Functor::ELEMENT_TYPE; - using OutT = bool; - - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>); - -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) -#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h deleted file mode 100644 index 15cd643a858cc018e3007fa90ec479900cd243be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T a, const T b) const { \ - return static_cast(a) op static_cast(b); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct LogicalNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T a) const { return !a; } -}; - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - Functor binary_func; - ElementwiseComputeEx(context, x, y, -1, - binary_func, out); - } -}; - -template -class UnaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - Functor unary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), - out->mutable_data(context.GetPlace()), unary_func); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); - -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 02f95254035d6041ef64dd746faa924abb053165..c3d7df8d0274371a4c5a482624c75b36677778a9 100644 
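The six kernels in the new compare_op_mlu.cc above all follow one pattern; only the CNNL logic-op enum differs. Because the angle-bracket arguments were stripped from this copy of the patch, the sketch below fills them back in with the usual conventions (framework::Tensor inputs, a bool output, one MLUDeviceContext/type pair per registered dtype); treat those spellings as assumptions rather than verbatim source.

template <typename DeviceContext, typename T>  // parameters assumed
class LessThanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::Tensor>("X");
    auto* y = ctx.Input<framework::Tensor>("Y");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());  // comparison results are bool

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    // CNNL_LOGIC_OP_LT is the only piece that changes across the six kernels
    // (EQ, NE, LT, LE, GT, GE map to equal, not_equal, less_than, less_equal,
    // greater_than, greater_equal).
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

// The registrations above list one instantiation per supported element type,
// presumably along the lines of:
//   REGISTER_OP_MLU_KERNEL(
//       less_than,
//       ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
//       ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
//       /* ... remaining integer and bool instantiations ... */);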
--- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 3bbb284ca821b8576f2752446555f146c16bb189..4e6fda3d09a071f59c97c87315619d126497a756 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::DSizes(); @@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector& perf_results, using framework::ConvSearchCache; -static void SetConvMathType(const framework::ExecutionContext& ctx, - cudnnDataType_t dtype, +static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); @@ -231,8 +230,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; bool has_got_workspace_size = true; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -284,8 +282,7 @@ struct SearchAlgorithm { } else if (deterministic) { algo = static_cast(1); } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -346,8 +343,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -413,8 +409,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = 
dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -478,8 +473,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -534,8 +528,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = *(framework::ConvSearchCache::Instance().GetBackwardFilter()); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu deleted file mode 100644 index dff60afd74c02f458b5b3c7428c2703197b61af0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ /dev/null @@ -1,1476 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the spopecific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using DataLayout = platform::DataLayout; - -static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -template -class CUDNNConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = 
ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - // Tensor Core introduced from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - // We will only do data format conversion from NHWC to NCHW. - // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW"); - - // ------------ transformed tensor ----------- - Tensor transformed_input_channel(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_filter_channel(filter->type()); - T* output_data = nullptr; - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst(ctx, output, - &transformed_output); - - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output.ShareDataWith(*output); - } - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - } else { - transformed_filter_channel.ShareDataWith(*filter); - } - output_data = transformed_output.data(); - - // update padding and dilation - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - - if 
(compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - std::vector input_pad(transformed_input_channel.dims().size() * 2, - 0); - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_format = GetCudnnTensorFormat(layout); - - args.handle = handle; - -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), groups); -#else - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. 
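// Note on the per-group bookkeeping a few lines below: the kernel steps through
// the groups by advancing raw data pointers by
//   group_offset_in     = (i_c / groups) * i_h * i_w * i_d
//   group_offset_out    = (o_c / groups) * o_h * o_w * o_d
//   group_offset_filter = filter.numel() / groups
// e.g. (illustrative numbers, not from the patch) groups = 2, input [1, 8, 4, 4],
// output [1, 6, 4, 4] gives offsets of 64 and 48 elements per group. On cuDNN 7+
// the group count is handed to the convolution descriptor instead and `groups` is
// reset to 1, so the per-group loop runs exactly once.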
- PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d, - &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. -// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - algo = search::Find(args, exhaustive_search, deterministic, - workspace_size, ctx); -#else - cudnnConvolutionFwdAlgo_t algo{}; - using search = SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. - if (ctx.Attr("groups") > 1) { - algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - -// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. -// ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; -// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args.idesc.desc(), input_data, - args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, - &beta, args.odesc.desc(), output_data, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args.idesc.desc(), - input_data + i * group_offset_in, args.wdesc.desc(), - filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, workspace_ptr, workspace_size, &beta, - args.odesc.desc(), output_data + i * group_offset_out)); - }, - workspace_size); - } -#endif - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_output, output); - } - } -}; - -template -class CUDNNConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - } - - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvGradOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // transform Tensor - Tensor transformed_input_channel(input->type()); - Tensor transformed_output_grad_channel(output_grad->type()); - Tensor transformed_input_grad_channel(input->type()); - Tensor transformed_filter_channel(filter->type()); - Tensor transformed_filter_grad_channel(filter->type()); - - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input, output_grad, input_grad and tensor from " - "NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - TransToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - - if (input_grad) { - ResizeToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy - // the data of input_grad to transformed_input_grad_channel. - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - TransToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - } - } - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output_grad_channel.ShareDataWith(*output_grad); - if (input_grad) { - transformed_input_grad_channel.ShareDataWith(*input_grad); - } - } - - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - - if (filter_grad) { - ResizeToChannelLast( - ctx, filter_grad, &transformed_filter_grad_channel); - } - } else { - transformed_filter_channel.ShareDataWith(*filter); - if (filter_grad) { - transformed_filter_grad_channel.ShareDataWith(*filter_grad); - } - } - - // update paddings - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // cuDNN only supports padding the same amount on every dimension. - // So we create a new padded input tensor. 
- int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input(input->type()); - Tensor transformed_input_grad(input->type()); - std::vector padding_common(data_dim, 0); - std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - - transformed_input_grad.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (input_grad) { - transformed_input_grad = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - // pad for input - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (input_grad) { - transformed_input_grad.ShareDataWith(transformed_input_grad_channel); - } - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_tensor = GetCudnnTensorFormat(layout); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n, - &o_c, &o_d, &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n, - &o_c, &o_d, &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find(args1, exhaustive_search, deterministic, - workspace_size_d, ctx); -#else - using search1 = SearchAlgorithm; - data_algo = - search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max(workspace_size_d, - search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, - iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find(args2, exhaustive_search, deterministic, - workspace_size_w, ctx); -#else - using search2 = SearchAlgorithm; - filter_algo = - search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, 
filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; -#else - ScalingParamType beta = - (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; -#endif - VLOG(4) << "Conv_grad: use_addto = " - << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); - - if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. -// When beta is 1, the output cannot be reset since addt strategy used. -#ifdef PADDLE_WITH_HIP - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - Tensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), temp_tensor_data, - cudnn_workspace_ptr, workspace_size_d)); - }, - workspace_size_d); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), - transformed_input_grad_data, &alpha, args1.idesc.desc(), - temp_tensor_data, &beta, args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); - }, - workspace_size_d); - } - -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - filter_data + i * group_offset_filter, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), data_algo, cudnn_workspace_ptr, - workspace_size_d, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size_d); - } -#endif - if (!is_sys_pad) { - std::vector starts(transformed_input_channel.dims().size(), 0); - std::vector axes(transformed_input_channel.dims().size(), 0); - - for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - - transformed_input_grad_channel.mutable_data(ctx.GetPlace()); - if (transformed_input_channel.dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } - } - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_input_grad_channel, input_grad); - } - } - - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. 
-#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), output_grad_data, - args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, - cudnn_workspace_ptr, workspace_size_w)); - }, - workspace_size_w); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - input_data + i * group_offset_in, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size_w, &beta_filter, args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size_w); - } -#endif - - if (compute_format == DataLayout::kNHWC) { - TransToChannelFirst( - ctx, &transformed_filter_grad_channel, filter_grad); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv(ddI, W) + conv(I, ddW) - * dW = conv_bp_filter(ddI, dO) - * dI = conv_bp_data(ddW, dO) - */ -template -class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, 
&transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, &transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - if (dX) { - transformed_dX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) 
{ - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{ - &transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, - dilations, dtype}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, - dilations, dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = - static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = - static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find(args1, exhaustive_search, false, - workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find(args2, exhaustive_search, false, - workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, fwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, 
search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find(args3, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_algo = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find(args4, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_algo = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), ddx, - args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, - &beta, args1.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, workspace_ptr, workspace_size, &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), - ddw, args2.cdesc.desc(), fwd_algo2, &beta, - args2.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, 
workspace_ptr, workspace_size, &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), transformed_dy_channel, - args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, - &beta, args3.wdesc.desc(), dw, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - ddx + i * group_offset_in, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), transformed_dy_channel, - args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, - &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args4.wdesc.desc(), - ddw + i * group_offset_filter, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue -// Use depthwise_conv2d in MIOPEN to resolve this issue -REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - 
paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 9c9795143eb78dc5c1b22ec792d8753f915c976e..66f718693847837a4d169a5cab9629a1f668244f 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -128,11 +128,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -170,11 +169,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -212,11 +210,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e345a4d2603b630508e299207984f4708217a1d8..8213e877f722433488cd826bb63cba376972c57a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } -#ifndef PADDLE_WITH_ASCEND_CL - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN or NPU is used")); - } -#endif +// #ifndef PADDLE_WITH_ASCEND_CL +// if (input_data_type == framework::proto::VarType::FP16) { +// PADDLE_ENFORCE_EQ( +// library, framework::LibraryType::kCUDNN, +// platform::errors::InvalidArgument( +// "float16 can only be used when CUDNN or NPU is used")); +// } +// #endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { @@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -// depthwise conv kernel -// TODO(xingzhaolong): neon kernel for mobile 
-REGISTER_OP_CPU_KERNEL( - depthwise_conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - REGISTER_OP_VERSION(conv2d) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc deleted file mode 100644 index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d, - ops::DepthwiseConvKernel, - ops::DepthwiseConvKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad, - ops::DepthwiseConvGradKernel, - ops::DepthwiseConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 26166362da8a2984dc3c0670b186b85800767fb7..a5d888765bf37d45d501a3dbe5437f7c2ab5fc51 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
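The removed `GemmConvKernel` whose body follows implements convolution as im2col/vol2col plus one GEMM per sample and per group, after `UpdatePaddingAndDilation` has normalized the paddings and kernel sizes. The shapes it sets up rest on the standard convolution output-size rule; a small self-contained sketch of that arithmetic (illustrative only, assuming explicit per-side padding and positive stride):

#include <cstdio>

// Output extent of one spatial dimension for a (possibly dilated) convolution:
//   effective_kernel = dilation * (kernel - 1) + 1
//   out = (in + pad_before + pad_after - effective_kernel) / stride + 1
int ConvOutSize(int in, int kernel, int dilation, int pad_before, int pad_after,
                int stride) {
  const int effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_before + pad_after - effective_kernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, stride 1, padding 1 on both sides -> 224.
  std::printf("same-padding 3x3: %d\n", ConvOutSize(224, 3, 1, 1, 1, 1));
  // 224x224 input, 3x3 filter, stride 2, no padding -> 111.
  std::printf("strided 3x3: %d\n", ConvOutSize(224, 3, 1, 0, 0, 2));
  return 0;
}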
- Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output(output->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims = - phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - auto& dev_ctx = context.template device_context(); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. 
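In the body above, `IsExpand(filter_shape_vec, strides, paddings, dilations)` decides whether im2col/vol2col is needed at all: for a 1x1 filter with unit stride, unit dilation and zero padding, each output position reads exactly one input position, so the input slice can be reused directly as the col matrix (the `ShareDataWith` branch that follows). A rough re-statement of that predicate, written from the checks visible here rather than copied from Paddle's helper:

#include <cassert>
#include <vector>

// Returns true when the im2col/vol2col expansion is actually required.
bool NeedsIm2Col(const std::vector<int>& filter_spatial_dims,
                 const std::vector<int>& strides,
                 const std::vector<int>& paddings,
                 const std::vector<int>& dilations) {
  bool filter_is_1 = true, stride_is_1 = true, padding_is_0 = true,
       dilation_is_1 = true;
  for (int k : filter_spatial_dims) filter_is_1 = filter_is_1 && (k == 1);
  for (int s : strides) stride_is_1 = stride_is_1 && (s == 1);
  for (int p : paddings) padding_is_0 = padding_is_0 && (p == 0);
  for (int d : dilations) dilation_is_1 = dilation_is_1 && (d == 1);
  // Only the trivial 1x1 / stride-1 / pad-0 / dilation-1 case can skip im2col.
  return !(filter_is_1 && stride_is_1 && padding_is_0 && dilation_is_1);
}

int main() {
  assert(!NeedsIm2Col({1, 1}, {1, 1}, {0, 0, 0, 0}, {1, 1}));  // plain 1x1 conv
  assert(NeedsIm2Col({3, 3}, {1, 1}, {1, 1, 1, 1}, {1, 1}));   // padded 3x3 conv
  return 0;
}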
- Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim in_matrix_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - Tensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, - T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); - } - } -}; - -template -class GemmConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
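The per-group GEMM in the loop above computes `out_slice = filter_slice * col_matrix`, where `filter_slice` is (C_out/groups) x (C_in/groups * k_h * k_w) and `col_matrix` is (C_in/groups * k_h * k_w) x (o_h * o_w), yielding a (C_out/groups) x (o_h * o_w) output slice, as the in-code comments state. A tiny naive-matmul sketch with those shapes (illustrative only; the real kernel goes through `blas.MatMul`):

#include <cstdio>
#include <vector>

// Plain row-major matmul: C(m x n) = A(m x k) * B(k x n).
std::vector<float> MatMul(const std::vector<float>& a,
                          const std::vector<float>& b, int m, int k, int n) {
  std::vector<float> c(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j) c[i * n + j] += a[i * k + p] * b[p * n + j];
  return c;
}

int main() {
  // Example geometry: C_in=8, C_out=16, groups=2, 3x3 filter, 5x5 output.
  const int groups = 2, c_in = 8, c_out = 16, k_h = 3, k_w = 3, o_h = 5, o_w = 5;
  const int m = c_out / groups;               // rows of filter_slice
  const int k = (c_in / groups) * k_h * k_w;  // shared inner dimension
  const int n = o_h * o_w;                    // columns of col_matrix
  std::vector<float> filter_slice(m * k, 1.f);
  std::vector<float> col_matrix(k * n, 1.f);
  std::vector<float> out_slice = MatMul(filter_slice, col_matrix, m, k, n);
  // With all-ones inputs every output entry equals the inner dimension (36).
  std::printf("out_slice[0] = %.0f (expected %d)\n", out_slice[0], k);
  return 0;
}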
- Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output_grad(output_grad->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - auto& dev_ctx = context.template device_context(); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - - framework::DDim input_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call 
the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->dtype()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), - &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - Tensor filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); - } - } - } - } -}; - -template -class GemmConvDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use 
CPUPlace.")); - const Tensor* X = ctx.Input("Input"); - const Tensor* dY = ctx.Input("DOutput"); - const Tensor* ddX = ctx.Input("DDInput"); - const Tensor* ddW_in = ctx.Input("DDFilter"); - - Tensor* ddY = ctx.Output("DDOutput"); - Tensor* dW = ctx.Output("DFilter"); - Tensor* dX = ctx.Output("DInput"); - Tensor W = GET_DATA_SAFELY(ctx.Input("Filter"), "Input", "Filter", - "GemmConvDoubleGrad"); - if (!ddY && !dW && !dX) return; - - const int groups = ctx.Attr("groups"); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_X(X->dtype()); - Tensor transformed_dY(dY->dtype()); - Tensor transformed_ddX(X->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X); - TransToChannelFirst(ctx, X, &transformed_X); - - ResizeToChannelFirst(ctx, dY, &transformed_dY); - TransToChannelFirst(ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX); - TransToChannelFirst(ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(phi::vectorize(W.dims())); - std::vector output_shape_vec( - phi::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - framework::DDim input_shape = - phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - framework::DDim filter_matrix_shape = {W.dims()[0], - W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col = ctx.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx 
convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dX->mutable_data(ctx.GetPlace()); - - Tensor transformed_dX(dX->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dW->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dW, static_cast(0)); - Tensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice, - T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - - Tensor transformed_ddY(ddY->dtype()); - if (channel_last) { - ResizeToChannelFirst(ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor ddy_batch = - transformed_ddY.Slice(i, i + 
1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - Tensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(0.0)); - } - - if (ddW_in) { - Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape); - Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, x_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddY, ddY); - } - } - } -}; - -template -class DepthwiseConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1] % - input->dims()[input->dims().size() - 1], - 0, platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1] % input->dims()[1], 0, - platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], input->dims()[1])); - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - auto& dev_ctx = context.template device_context(); - - if (fuse_relu) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } else { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } - } -}; - -template -class DepthwiseConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (fuse_relu) { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } else { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - if (fuse_relu) { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } else { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 4b8f9d7e6ca8d2f1dae99f1d034c53daf948f922..1841b78af32dd95d6884d5eb78ad30322ba7723e 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_input; @@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -242,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + algo = search::Find( + args, false, deterministic, workspace_size, + ctx.template device_context()); #else using search = SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + algo = search::Find( + args, false, deterministic, + ctx.template 
device_context()); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); #endif @@ -375,7 +381,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_output_grad; @@ -407,13 +413,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; case 5: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; default: @@ -499,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = - search1::Find(args1, false, deterministic, workspace_size, ctx); + data_algo = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); + data_algo = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); #endif @@ -521,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = - search2::Find(args2, false, deterministic, workspace_size, ctx); + filter_algo = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); + filter_algo = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); #endif @@ -735,7 +747,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -794,26 +806,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (dO) { - math::PadFunction( - ctx, input_pad, transformed_dO_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_dO_channel, pad_value, &transformed_dO); } if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, 
transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; @@ -940,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = - search1::Find(args1, false, deterministic, workspace_size, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); #endif } @@ -961,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = - search2::Find(args2, false, deterministic, workspace_size, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); #endif @@ -986,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = - search3::Find(args3, false, deterministic, workspace_size, ctx); + filter_algo = search3::Find( + args3, false, deterministic, workspace_size, + ctx.template device_context()); #else using search3 = SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); + filter_algo = search3::Find( + args3, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); #endif @@ -1009,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = - search4::Find(args4, false, deterministic, workspace_size, ctx); + data_algo = search4::Find( + args4, false, deterministic, workspace_size, + ctx.template device_context()); #else using search4 = SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); + data_algo = search4::Find( + args4, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); #endif diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu index b2a4910222f1178d23e94eade9580248bb103c88..054cb4b33895b02a816cc2bff82b1c9052bc645d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ b/paddle/fluid/operators/conv_transpose_op.cu @@ -13,10 +13,150 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be error to the 1st dimension of filter. But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); + } + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + static_cast::TYPE&>(dev_ctx), + *output, filter, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, output, data_layout); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + 
+ framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv( + static_cast::TYPE&>(dev_ctx), + *output_grad, filter, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, input_grad, data_layout); + } + + if (filter_grad) { + phi::funcs::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + static_cast::TYPE&>(dev_ctx), + *output_grad, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, filter_grad, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle // conv2d REGISTER_OP_CUDA_KERNEL(conv2d_transpose, ops::GemmConvTransposeKernel, diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 76d6ad6bf2ff7361a90fb6f013f989db5a2b8845..ee0fb7ab3683364f6db3cffd7ddef67c61f19433 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } }; -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - dev_ctx, *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - dev_ctx, *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - dev_ctx, *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index fe00ee06603f0ecf2e3fa6ac367303a70702508f..674b75625d1983ba97f3d47ee154beff79c42dad 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -109,8 +109,8 @@ class 
CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, - PT_INFER_META(phi::CrossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PD_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, ops::CrossGradMaker, diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h deleted file mode 100644 index ab3860ecafc3569c13b0b9e5c882df9ddc03e190..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cum_op.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class CumKernel : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - - void Compute(const framework::ExecutionContext& context) const override { - auto& X = GET_DATA_SAFELY(context.Input("X"), "Input", - "X", "Cum"); - - auto& Out = GET_DATA_SAFELY(context.Output("Out"), - "Output", "Out", "Cum"); - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = Out.dims(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - Out.template mutable_data(context.GetPlace()); - - int pre = 1; - int post = 1; - int mid = out_dims[axis]; - for (int i = 0; i < axis; ++i) { - pre *= out_dims[i]; - } - for (int i = axis + 1; i < out_dims.size(); ++i) { - post *= out_dims[i]; - } - - auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); - auto* place = - context.template device_context().eigen_device(); - - using IndexT = Eigen::DenseIndex; - if (pre == 1) { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(mid), x, out, - /* axis= */ 0, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(mid, post), x, out, - /* axis= */ 0, reverse, exclusive); - } - } else { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, - /* axis= */ 1, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, - /* axis= */ 1, reverse, exclusive); - } - } - } - - private: - template - void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, - bool reverse, bool exclusive) const { - if (!reverse) { - out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); - } else { - std::array rev; - rev.fill(false); - rev[axis] = reverse; - out.reshape(dims).device(d) = - Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); - } - } -}; - -template -struct CumsumFunctor { - using ELEMENT_TYPE = T; - template - const typename X::TensorScanSumOp operator()(X x, int axis, - bool exclusive) const { - return x.cumsum(axis, exclusive); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 9fa355a924612651556f2a79711cae4ce17379f8..11633fb0b870327f14e4454b3f94a43940a9df53 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/cum_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,15 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); -REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu deleted file mode 100644 index 3402f42521f54f315390fe2162309fb204fd9b00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumsum_op.cu +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -using Tensor = paddle::framework::Tensor; -using LoDTensor = paddle::framework::LoDTensor; - -namespace paddle { -namespace operators { - -template -__device__ void BlockReverse(const T* idata, T* odata, int src_base, - int dst_base, int valid_item) { - __shared__ T sh_mem[BLOCK_SIZE]; - int tx = threadIdx.x; - - int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset - 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; - } - - __syncthreads(); - int out_index = dst_base - offset; - if (offset < valid_item) { - int sh_mem_index = BLOCK_SIZE - offset - 1; - odata[out_index] = sh_mem[sh_mem_index]; - } -} - -template -__global__ void MatrixRowReverse(const T* matrix_data, T* reverse_data, - int reverse_size, int outer_size, - int inner_size) { - int bx = blockIdx.x; - int by = blockIdx.y; - int item_per_block = 1024; - - for (int block_offset = 0; block_offset < reverse_size; - block_offset += item_per_block) { - int valid_item = (reverse_size - block_offset > item_per_block) - ? item_per_block - : reverse_size - block_offset; - int src_offset = - bx * reverse_size + block_offset + by * (inner_size * reverse_size); - int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + - reverse_size - 1 - block_offset; - if (reverse_size < item_per_block) { - valid_item = reverse_size; - } - - BlockReverse(matrix_data, reverse_data, src_offset, dst_offset, - valid_item); - } -} - -template -struct BlockPrefixCallbackOp { - // Running prefix - T running_total; - // Constructor - __device__ BlockPrefixCallbackOp(T running_total) - : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. 
- __device__ T operator()(T block_aggregate) { - T old_prefix = running_total; - running_total = old_prefix + block_aggregate; - return old_prefix; - } -}; - -// No bank-conflict transpose -template -__global__ void MatrixTranspose(T* odata, const T* idata, size_t height, - size_t width) { - __shared__ T tile[TILE_DIM][TILE_DIM + 1]; - - int x = blockIdx.x * TILE_DIM + threadIdx.x; - int y = blockIdx.y * TILE_DIM + threadIdx.y; - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < width && (y + j) < height) { - tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; - } else { - tile[threadIdx.y + j][threadIdx.x] = 0; - } - } - - __syncthreads(); - - x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset - y = blockIdx.x * TILE_DIM + threadIdx.y; - - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < height && (y + j) < width) { - odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; - } - } -} - -template -__global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size, - int outer_size, int scan_size, bool exclusive) { - // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types - typedef cub::BlockLoad - BlockLoadT; - typedef cub::BlockStore - BlockStoreT; - typedef cub::BlockScan BlockScanT; - // Allocate type-safe, repurposable shared memory for collectives - __shared__ union { - typename BlockLoadT::TempStorage load; - typename BlockStoreT::TempStorage store; - typename BlockScanT::TempStorage scan; - } temp_storage; - - int bx = blockIdx.x; - int by = blockIdx.y; - - BlockPrefixCallbackOp prefix_op(0); - T block_aggregate = static_cast(0); - - // Obtain this block's segment of consecutive keys (blocked across threads) - int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; - for (int block_offset = 0; block_offset < scan_size; - block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { - int valid_item = (scan_size - block_offset > item_per_block) - ? item_per_block - : (scan_size - block_offset); - if (scan_size < item_per_block) { - valid_item = scan_size; - } - - int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); - - T thread_keys[ITEMS_PER_THREAD]; - BlockLoadT(temp_storage.load) - .Load(d_in + offset, thread_keys, valid_item, 0); - - __syncthreads(); - if (exclusive) { - T init_value = static_cast(0); - BlockScanT(temp_storage.scan) - .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } else { - BlockScanT(temp_storage.scan) - .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } - __syncthreads(); - - BlockStoreT(temp_storage.store) - .Store(d_out + offset, thread_keys, valid_item); - } -} - -template -class CumCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = out->dims(); - auto size = in->numel(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - T* out_data = out->mutable_data(context.GetPlace()); - const T* in_data = in->data(); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - if (size == out_dims[axis]) { - if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); - if (exclusive) { - thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } else { - thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } - thrust::reverse(thrust::device, out_data, out_data + size); - } else { - if (exclusive) { - thrust::exclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } else { - thrust::inclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } - } - return; - } - - size_t height = 1; - size_t width = 1; - for (size_t i = 0; i <= axis; i++) { - height *= out_dims[i]; - } - - for (size_t i = axis + 1; i < out_dims.size(); i++) { - width *= out_dims[i]; - } - int scan_size = out_dims[axis]; - bool transpose = (axis != out_dims.size() - 1); - - int tile_size = 32; - dim3 blocks(32, 8); - dim3 transpose_grids((width + tile_size - 1) / tile_size, - (height + tile_size - 1) / tile_size); - auto& dev_ctx = context.template device_context(); - framework::Tensor tmp; - tmp.Resize(out_dims); - auto* tmp_data = tmp.mutable_data(context.GetPlace()); - T* next_in_data = out_data; - T* next_out_data = tmp_data; - if (transpose) { - MatrixTranspose<<>>( - out_data, in_data, height, width); - next_in_data = out_data; - next_out_data = tmp_data; - } - auto swap_ptr = [](T*& ptr1, T*& ptr2) { - T* tmp = ptr2; - ptr2 = ptr1; - ptr1 = tmp; - }; - int outer_size = height / scan_size; - int inner_size = width; - // Consider the size of shared memory, here block size is 128 - dim3 scan_grid(outer_size, inner_size); - dim3 reverse_grid = scan_grid; - if (reverse) { - if (transpose) { - reverse_grid.x = scan_grid.y; - reverse_grid.y = scan_grid.x; - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - if (!transpose) next_in_data = tmp_data; - swap_ptr(next_in_data, next_out_data); - } else { - MatrixRowReverse<<>>( - in_data, out_data, scan_size, outer_size, inner_size); - } - } - if (!transpose && !reverse) { - BlockScanKernel<<>>( - out_data, in_data, outer_size, inner_size, scan_size, exclusive); - - } else { - BlockScanKernel<<>>( - next_out_data, next_in_data, outer_size, inner_size, scan_size, - exclusive); - } - swap_ptr(next_in_data, next_out_data); - if (reverse) { - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - swap_ptr(next_in_data, next_out_data); - } - if (transpose) { - transpose_grids.x = (height + tile_size - 1) / tile_size; - transpose_grids.y = (width + tile_size - 1) / tile_size; - MatrixTranspose<<>>( - next_out_data, next_in_data, width, height); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cumsum, ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 38bf53ca0aa1a2dddca4ac2d2043de10fcdb7830..d197e4362e96976661ab891929b4503977f52ff0 100644 --- 
a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/cum_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1ebafa54598574ae9027a4887639a2a1d27448ea..568c7982cfc7c07b9c7f840ccaa32e4025225122 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index b361bc3ab75e8ad84bbf2a353230a90e01b99b74..f170fbbe4b534ed5f6bb97508048a72ac766de90 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ce9ac3de4e78c2aa562718719b111c9c47376bc8..860fdd01794ccc9898332f6f0d0ba4e9c3e296d6 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -23,11 +23,11 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { namespace operators { @@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); Tensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); - GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, - &sorted_batch_id); + phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + phi::funcs::GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); Tensor batch_index_t; int* batch_idx_in = @@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, sizeof(int) * 8, dev_ctx.stream()); - GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); Tensor length_lod; int* length_lod_data = diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index a60f881ebf3e3bd825219dce1fb9f377d90c7a94..e5ae9a6ccbda5acbdb37d1190314c94ca4007c07 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -21,7 +21,6 @@ limitations under the License.*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = context.MultiInput("MultiLevelScores"); - auto multi_rois_num = context.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu 
b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index c117fbd70f52827a724c07213cd020d1b58cce22..7ad25e003b491294287a62433b8bf494086a87c2 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -24,9 +24,9 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { start = end; multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + phi::funcs::GPUGather(dev_ctx, *fpn_rois, sub_idx, + multi_fpn_rois[i]); } else { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 628cbcd761186bd060fdcbd2b68fe8defec1bf17..5479e08c2a5efa96e64eca45d75af7a6a60a8862 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -28,10 +27,11 @@ namespace operators { const int kBoxDim = 4; -inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetLodFromRoisNum( + const framework::Tensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - Tensor cpu_tensor; + framework::Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); @@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector fpn_rois_lod; int fpn_rois_num; if (context.HasInput("RoisNum")) { - auto* rois_num = context.Input("RoisNum"); + auto* rois_num = context.Input("RoisNum"); fpn_rois_lod = GetLodFromRoisNum(rois_num); } else { fpn_rois_lod = fpn_rois->lod().back(); @@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector num_rois_level(num_level, 0); std::vector num_rois_level_integral(num_level + 1, 0); for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { @@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); size_t cur_offset = fpn_rois_lod[i]; @@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public 
framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } - auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiOutput("MultiLevelRoIsNum"); if (multi_rois_num.size() > 0) { int batch_size = fpn_rois_lod.size() - 1; for (int i = 0; i < num_level; ++i) { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index e6af1a5bbf71cf24cd355dc09cb439e0bc9fbfba..c9cc4e722071c69f0bf658ad69363dbdd75b63e4 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 424aa0714400d3c8a897f98b9209222aa61acef8..cbf17048400bfd967e311897bf8d6d6e11d6000b 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, fg_inds_t, &fg_boxes); + phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, bg_inds_t, &bg_boxes); + phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); Concat(context, fg_boxes, bg_boxes, sampled_boxes); - CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); fg_labels.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); } @@ -334,7 +334,7 @@ std::vector SampleRoisForOneImage( } else { proposals_num = keep.numel(); 
roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - CPUGather(context, rpn_rois, keep, &roi_filter); + phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); } T* roi_filter_dt = roi_filter.data(); memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 8c4bd4ac61320356073107b7a109e3c27d6b41a1..d6130823271f05c83e590d28b41c3baf73e054f0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6e3c322c1748353d4f447dd6a927e13c4d04025c..5fb7973fd89e49f1cc19458059bffe0dadb9aa3e 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -85,8 +86,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -102,8 +103,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 6351ea865cd0eb3891f2b4882a587b2feeb6c67a..1f1802574c5b82281b0a7ecc79d9057df61c37e6 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - 
CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 93ba3deca5fc4f1b0247f90f21936faaaf9c0b43..005309e8ee577119fd295126c40b46a11a762497 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -86,8 +87,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -104,8 +105,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 7927410ef37862499aadf61d6e04c45af157f347..83cf6e5fd30f6bcad4870d1ebd18a50e21518dfe 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -93,7 +93,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. 
if (score_size == 3) { - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } else { ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } @@ -545,11 +545,10 @@ class MultiClassNMS2Op : public MultiClassNMSOp { void InferShape(framework::InferShapeContext* ctx) const override { MultiClassNMSOp::InferShape(ctx); - auto box_dims = ctx->GetInputDim("BBoxes"); auto score_dims = ctx->GetInputDim("Scores"); auto score_size = score_dims.size(); if (score_size == 3) { - ctx->SetOutputDim("Index", {box_dims[1], 1}); + ctx->SetOutputDim("Index", {-1, 1}); } else { ctx->SetOutputDim("Index", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 48b0d511d902ce96e39c392cab661e19fa31f875..0d9fbf612f73c428fb8050fcfcc319ddafabe482 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -102,7 +101,12 @@ class YoloBoxOp : public framework::OperatorWithKernel { "But received class_num (%s)", class_num)); - int box_num = dim_x[2] * dim_x[3] * anchor_num; + int box_num; + if ((dim_x[2] > 0 && dim_x[3] > 0) || ctx->IsRuntime()) { + box_num = dim_x[2] * dim_x[3] * anchor_num; + } else { + box_num = -1; + } std::vector dim_boxes({dim_x[0], box_num, 4}); ctx->SetOutputDim("Boxes", phi::make_ddim(dim_boxes)); @@ -235,8 +239,6 @@ REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, - ops::YoloBoxKernel); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu deleted file mode 100644 index fb5c214a59e1274ffc30226bf49a068df960f414..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int n, const int h, - const int w, const int an_num, const int class_num, - const int box_num, int input_size_h, - int input_size_w, bool clip_bbox, const float scale, - const float bias, bool iou_aware, - const float iou_aware_factor) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - T box[4]; - for (; tid < n * box_num; tid += stride) { - int grid_num = h * w; - int i = tid / box_num; - int j = (tid % box_num) / grid_num; - int k = (tid % grid_num) / w; - int l = tid % w; - - int an_stride = (5 + class_num) * grid_num; - int img_height = imgsize[2 * i]; - int img_width = imgsize[2 * i + 1]; - - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, - iou_aware); - T conf = sigmoid(input[obj_idx]); - if (iou_aware) { - int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); - T iou = sigmoid(input[iou_idx]); - conf = pow(conf, static_cast(1. - iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, - iou_aware); - GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, - input_size_w, box_idx, grid_num, img_height, img_width, scale, - bias); - box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, - 5, iou_aware); - int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; - CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, - grid_num); - } -} - -template -class YoloBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* img_size = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - const auto cplace = platform::CPUPlace(); - memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); - - const T* input_data = input->data(); - const 
int* imgsize_data = img_size->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, boxes, static_cast(0)); - set_zero(dev_ctx, scores, static_cast(0)); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - - KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, - ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h deleted file mode 100644 index 2cd69c60b7c44d0557c23b8d1bd933650e8402c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
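Note (reference sketch, not part of the patch): the CUDA kernel deleted above and the CPU header deleted below implement the same box decode, which now lives in the phi yolo_box kernel. The math, restated standalone (names mirror the deleted GetYoloBox helper):

#include <cmath>

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// Decode one raw prediction into an (cx, cy, w, h) box, as in the deleted
// GetYoloBox: stride is the grid cell count h*w, index points at the cx
// channel of this anchor/cell, scale and bias are the scale_x_y adjustment.
void DecodeYoloBox(float box[4], const float* x, const int* anchors, int i,
                   int j, int an_idx, int grid_h, int grid_w, int input_h,
                   int input_w, int index, int stride, int img_h, int img_w,
                   float scale, float bias) {
  box[0] = (i + Sigmoid(x[index]) * scale + bias) * img_w / grid_w;
  box[1] = (j + Sigmoid(x[index + stride]) * scale + bias) * img_h / grid_h;
  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_w /
           input_w;
  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * img_h /
           input_h;
}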
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE inline T sigmoid(T x) { - return 1.0 / (1.0 + std::exp(-x)); -} - -template -HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size_h, - int grid_size_w, int input_size_h, - int input_size_w, int index, int stride, - int img_height, int img_width, float scale, - float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; - box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size_h; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size_w; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size_h; -} - -HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, - int an_num, int an_stride, int stride, - int entry, bool iou_aware) { - if (iou_aware) { - return (batch * an_num + an_idx) * an_stride + - (batch * an_num + an_num + entry) * stride + hw_idx; - } else { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; - } -} - -HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride) { - return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + - hw_idx; -} - -template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, - const int img_width, bool clip_bbox) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - if (clip_bbox) { - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? 
boxes[box_idx + 3] - : static_cast(img_height - 1); - } -} - -template -HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} - -template -class YoloBoxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* imgsize = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - Tensor anchors_; - auto anchors_data = - anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); - std::copy(anchors.begin(), anchors.end(), anchors_data); - - const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); - - T box[4]; - for (int i = 0; i < n; i++) { - int img_height = imgsize_data[2 * i]; - int img_width = imgsize_data[2 * i + 1]; - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 4, iou_aware); - T conf = sigmoid(input_data[obj_idx]); - if (iou_aware) { - int iou_idx = - GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); - T iou = sigmoid(input_data[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 0, iou_aware); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, - input_size_h, input_size_w, box_idx, stride, - img_height, img_width, scale, bias); - box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, - clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 5, iou_aware); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; - CalcLabelScore(scores_data, input_data, label_idx, score_idx, - class_num, conf, stride); - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 375ef4344f4741c947ef3134696d64cdae696780..f89ecd37222870f73d00870c9454bf5590d504e3 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
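Note (standalone check, not part of the patch): the rewritten DeterminantGradKernel above relies on the identity d|A|/dA = |A| * inverse(A)^T, which is exactly the chain unsqueeze(dOut * |A|) * inverse(A).transpose(-2, -1) built from phi ops. A tiny 2x2 verification, independent of the Paddle kernels:

#include <cstdio>

int main() {
  double a = 3, b = 1, c = 2, d = 4;        // A = [[a, b], [c, d]]
  double det = a * d - b * c;               // |A| = 10
  double inv[2][2] = {{d / det, -b / det},  // inverse(A)
                      {-c / det, a / det}};
  // gradient of det w.r.t. each entry: |A| * inverse(A)^T
  double grad[2][2] = {{det * inv[0][0], det * inv[1][0]},
                       {det * inv[0][1], det * inv[1][1]}};
  std::printf("d|A|/dA = [[%g, %g], [%g, %g]]\n",
              grad[0][0], grad[0][1], grad[1][0], grad[1][1]);
  // expected [[d, -c], [-b, a]] = [[4, -2], [-1, 3]], matching the
  // elementwise partial derivatives of ad - bc.
  return 0;
}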
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
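Note (reference only): the slogdet backward above follows the companion convention d(log|det A|)/dA = (A^{-1})^H, with the conjugate a no-op for real inputs. That is why the kernel only needs inverse(A).conj().transpose(-2, -1) scaled by the incoming gradient, and why a singular input is filled with NaN here (log|det A| is undefined at det A = 0) instead of the zero fill used in the plain determinant backward.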
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 0160277dc79af50c555b1257e6ffa216b7b56b62..93fbff67e220bcf7d1f8dab112a07cc42649595f 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -62,8 +62,8 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, - PT_INFER_META(phi::DiagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PD_INFER_META(phi::DiagInferMeta)); REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index b419f629a1e635c5a463b732af3003e93a5674d6..bf3cc941539eaeb2e03f53eb2465532469be5697 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
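Note (schematic, not part of the patch): diag_v2 above (which also corrects the DELCARE_/PT_ spellings) and the diagonal, dist and dot operators below all apply the same mechanical migration: drop the hand-written InferShape and register a functor that forwards to the phi infermeta. The shape of the change, with a hypothetical op name (the macros are the real ones used in this patch; the names around them are placeholders):

DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyOpInferMeta));
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  ops::MyOpGradMaker<paddle::framework::OpDesc>,
                  ops::MyOpGradMaker<paddle::imperative::OpBase>,
                  MyOpInferShapeFunctor);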
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,74 +23,6 @@ namespace operators { class DiagonalOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "diagonal"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diagonal"); - - int offset_ = ctx->Attrs().Get("offset"); - int axis1 = ctx->Attrs().Get("axis1"); - int axis2 = ctx->Attrs().Get("axis2"); - - auto x_dims = ctx->GetInputDim("Input"); - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; - - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis1)); - PADDLE_ENFORCE_LT( - axis2_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis2)); - PADDLE_ENFORCE_NE(axis1_, axis2_, - platform::errors::InvalidArgument( - "The dimensions should not be identical " - "%d vs %d.", - axis1, axis2)); - - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. 
- */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); - - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } }; class DiagonalOpMaker : public framework::OpProtoAndCheckerMaker { @@ -170,9 +105,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PD_INFER_META(phi::DiagonalInferMeta)); + REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, - ops::DiagonalGradOpMaker); + ops::DiagonalGradOpMaker, + DiagonalInferShapeFunctor); REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp, ops::DiagonalGradNoNeedBufferVarsInferer) diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 3a53f1365567f99c9446077f7939d87c156c9a08..55b2484941293c8db47ef847bea959ebe82ff3ae 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/dist_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PD_INFER_META(phi::DistInferMeta)); + REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, - ops::DistGradOpMaker); + ops::DistGradOpMaker, + DistInferShapeFunctor); REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); -REGISTER_OP_CPU_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CPU_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel) diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/fluid/operators/dist_op.cu deleted file mode 100644 index 90674969e283f1cba816ad46802cdbf971bcc555..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dist_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
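Note (standalone restatement, not part of the patch): the output-shape rule deleted from DiagonalOp::InferShape above is now owned by phi::DiagonalInferMeta. For reference, the length of the extracted diagonal as a function of the two axis sizes and the offset:

#include <algorithm>
#include <cstdint>

// Diagonal length rule removed above: offset 0 takes the main diagonal,
// positive offsets move along axis2, negative offsets along axis1, and a
// diagonal that falls entirely outside the matrix has length 0.
int64_t DiagonalLen(int64_t axis1_size, int64_t axis2_size, int offset) {
  if (offset == 0) return std::min(axis1_size, axis2_size);
  if (offset > 0)
    return axis2_size - offset > 0 ? std::min(axis1_size, axis2_size - offset)
                                   : 0;
  return axis1_size + offset > 0 ? std::min(axis1_size + offset, axis2_size)
                                 : 0;
}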
- -#include "paddle/fluid/operators/dist_op.h" - -namespace ops = paddle::operators; -#ifdef PADDLE_WITH_HIP -// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 -// do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL( - dist, ops::DistKernel); -REGISTER_OP_CUDA_KERNEL( - dist_grad, ops::DistGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CUDA_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel); -#endif diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h deleted file mode 100644 index dfd7e29a8d0102261746ab47d3e1e805a674d7b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dist_op.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using framework::Tensor; - -template -static void GetBraodcastDims(const framework::DDim& x_dims, - const framework::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - platform::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, y_dims)); -} - -static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - auto p = context.Attr("p"); - out->mutable_data(context.GetPlace()); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - - // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -static void DistGradFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Input("Out"); - auto p = context.Attr("p"); - - auto x_grad = context.Output(framework::GradVarName("X")); - auto y_grad = context.Output(framework::GradVarName("Y")); - auto out_grad = context.Input(framework::GradVarName("Out")); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - auto out_dims = context.Input("Out")->dims(); - - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - framework::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - framework::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = - *context.template device_context().eigen_device(); - auto out_grad_t = EigenTensor::From(*out_grad, out_new_dims); - framework::Tensor grad; - grad.mutable_data(new_dims, context.GetPlace()); - auto grad_t = EigenTensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
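Note (scalar sketch, not part of the patch): the deleted DistFunction above, and the phi dist kernel that replaces it, compute the following p-norm of x - y after broadcasting. Restated standalone over equal-length, already-broadcast inputs:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

double PDist(const std::vector<double>& x, const std::vector<double>& y,
             double p) {
  if (p == 0.0) {                       // number of non-zero elements of x - y
    std::size_t cnt = 0;
    for (std::size_t i = 0; i < x.size(); ++i) cnt += (x[i] != y[i]);
    return static_cast<double>(cnt);
  }
  if (std::isinf(p)) {                  // max (p=+inf) or min (p=-inf) of |x-y|
    double ext = std::fabs(x[0] - y[0]);
    for (std::size_t i = 1; i < x.size(); ++i) {
      double d = std::fabs(x[i] - y[i]);
      ext = p > 0 ? std::max(ext, d) : std::min(ext, d);
    }
    return ext;
  }
  double sum = 0.0;                     // otherwise (sum |x-y|^p)^(1/p)
  for (std::size_t i = 0; i < x.size(); ++i)
    sum += std::pow(std::fabs(x[i] - y[i]), p);
  return std::pow(sum, 1.0 / p);
}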
- if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - auto x_grad_t = EigenTensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - y_grad->mutable_data(context.GetPlace()); - auto y_grad_t = EigenTensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -class DistKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistFunction(context); - break; - case 2: - DistFunction(context); - break; - case 3: - DistFunction(context); - break; - case 4: - DistFunction(context); - break; - case 5: - DistFunction(context); - break; - case 6: - DistFunction(context); - break; - } - } -}; - -template -class DistGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistGradFunction(context); - break; - case 2: - DistGradFunction(context); - break; - case 3: - DistGradFunction(context); - break; - case 4: - DistGradFunction(context); - break; - case 5: - DistGradFunction(context); - break; - case 6: - DistGradFunction(context); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index ed2b09796eeeb8ce18fdc47be58347d85e6e1a80..8efdd15781a6f2dab48c0680ba87c7b427dc60ec 100644 
--- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"), - platform::errors::PreconditionNotMet( - "Input(X) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"), - platform::errors::PreconditionNotMet( - "Input(Y) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"), - platform::errors::PreconditionNotMet( - "Output(Out) of DotOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = static_cast(x_dims.size()); - PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - platform::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - true, x_rank == (size_t)y_dims.size(), - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; - } - } - - PADDLE_ENFORCE_EQ(true, shape_match, - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), y_dims.to_str())); - auto dims = vectorize(x_dims); - dims[dims.size() - 1] = 1; - ctx->SetOutputDim("Out", phi::make_ddim(dims)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PD_INFER_META(phi::DotInferMeta)); + REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, - ops::DotOpGradMaker); + ops::DotOpGradMaker, + DotInferShapeFunctor); REGISTER_OPERATOR(dot_grad, ops::DotGradOp); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90..144198367d538e178a745c22902bb77a65f45fe4 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. 
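Note (standalone restatement, not part of the patch): the checks deleted from DotOp::InferShape below are now handled by phi::DotInferMeta. The rule they enforced, for reference:

#include <cstdint>
#include <stdexcept>
#include <vector>

// dot expects X and Y to share the same 1-D or 2-D shape; Out keeps X's
// shape with the last dimension reduced to 1 (the per-row dot product).
std::vector<int64_t> DotOutShape(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims) {
  if (x_dims.size() != 1 && x_dims.size() != 2)
    throw std::invalid_argument("dot expects a 1-D or 2-D input");
  if (x_dims != y_dims)
    throw std::invalid_argument("dot expects X and Y to have the same shape");
  std::vector<int64_t> out = x_dims;
  out.back() = 1;
  return out;
}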
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -86,8 +85,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, bool is_upscale_in_train, uint64_t increment) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; #ifdef PADDLE_WITH_HIP int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -102,7 +101,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, MT factor = static_cast(1.0f / (1.0f - dropout_prob)); for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) { LoadT src_val; - platform::Load(&src[i], &src_val); + phi::Load(&src[i], &src_val); #ifdef PADDLE_WITH_HIP float4 rand = hiprand_uniform4(&state); @@ -126,8 +125,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } - platform::Store(dst_val, &dst[i]); - platform::Store(mask_val, &mask[i]); + phi::Store(dst_val, &dst[i]); + phi::Store(mask_val, &mask[i]); } } @@ -153,16 +152,16 @@ __global__ void DropoutGradCUDAKernel( const typename details::MPTypeTrait::Type factor, const int64_t size, T* dx) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_val; - platform::Load(&dout[i], &dout_val); + phi::Load(&dout[i], &dout_val); MaskLoadT mask_val; - platform::Load(&mask[i], &mask_val); + phi::Load(&mask[i], &mask_val); LoadT dx_val; @@ -172,27 +171,28 @@ __global__ void DropoutGradCUDAKernel( static_cast(mask_val[j]) * factor); } - platform::Store(dx_val, &dx[i]); + phi::Store(dx_val, &dx[i]); } } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); + int64_t x_numel = x.numel(); + auto stream = dev_ctx.stream(); + auto* x_data = x.data(); + auto* y_data = y->data(); if (!is_test) { - int64_t x_numel = x.numel(); - auto stream = dev_ctx.stream(); auto* mask_data = mask->data(); size_t size = phi::product(mask->dims()); - auto* x_data = x.data(); - auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -219,8 +219,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, uint64_t increment; // VectorizedRandomGenerator use curand_uniform4, so we only support 
// vec_size is 4; - int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 4 : 1; + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -254,22 +255,37 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } #endif } else { - auto X = EigenMatrix::Reshape(x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); if (upscale_in_train) { - Y.device(place) = X; +// todo: can y share with data with x directly? +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + hipMemcpyDeviceToDevice, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + cudaMemcpyDeviceToDevice, stream)); +#endif } else { - Y.device(place) = X * static_cast(1.0f - dropout_prob); + using MT = typename details::MPTypeTrait::Type; + MT factor = static_cast(1.0f - dropout_prob); + std::vector ins = {&x}; + std::vector outs = {y}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce3887ca25ea1df34048f15663b2e987..c62d45570ba291dc60120c393d21842cc6548c61 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc2084decc0b383eec199f7e10991..6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
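Note (CPU reference sketch, not part of the patch): the forward driver rewritten above and the CPU/GPU kernels deleted below preserve the same dropout contract. Restated standalone, with std::mt19937 standing in for Paddle's generator:

#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

// In "upscale_in_train" mode kept values are scaled by 1/(1-p) during
// training and passed through untouched at inference; in the legacy mode
// kept values are untouched during training and scaled by (1-p) at inference.
void DropoutForward(const std::vector<float>& x, float p, bool is_test,
                    bool upscale_in_train, unsigned seed,
                    std::vector<float>* y, std::vector<uint8_t>* mask) {
  y->resize(x.size());
  mask->assign(x.size(), 1);
  if (is_test) {
    float scale = upscale_in_train ? 1.0f : (1.0f - p);
    for (std::size_t i = 0; i < x.size(); ++i) (*y)[i] = x[i] * scale;
    return;
  }
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (dist(gen) < p) {
      (*mask)[i] = 0;
      (*y)[i] = 0.0f;
    } else {
      (*y)[i] = upscale_in_train ? x[i] / (1.0f - p) : x[i];
    }
  }
}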
*/ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d3c7961781f875da69f89df1edec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e61947470c22f18e47acce2fca4cb9c41f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623c666f3ce82a890a119e3e173390..07b3b5381162575cbfc03dd8cc10d0c88a2d21e8 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 206d9a6c5e9c9869216f0a6c137accc931aa2a77..bdf08646f1d8b94d6d8d141d8a9fa9864cdc937b 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -24,14 +24,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(dropout); +USE_OP_ITSELF(dropout); void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 07b7e2cc7c09b09d6640f49fce438d58d0cc9cf2..7d8660f238abc8446b2988aad24a64c565e01ef9 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" + #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU +using Tensor = framework::Tensor; template class DropoutXPUKernel : public framework::OpKernel { using XPUTyp = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 03b25c6705ac562c57cc905766dd8062ebcb741d..5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -18,12 +18,19 @@ #include #include #include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + #define EPSILON 1e-6 namespace paddle { @@ -87,19 +94,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - phi::funcs::Real* rwork_data = nullptr; + phi::dtype::Real* rwork_data = nullptr; rwork.Resize(phi::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 
jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); lwork = std::max( - 1, static_cast(phi::funcs::Real(computed_work_size))); + 1, static_cast(phi::dtype::Real(computed_work_size))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -109,7 +116,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,23 +214,28 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = phi::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, + real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), + real_vectors.mutable_data>(x->dims(), context.GetPlace()); - ApplyEigKernel>( + ApplyEigKernel>( *x, &real_values, &real_vectors, context); - auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, phi::funcs::Real, Tout>(context); + + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); // 1. extract real part & imag part from real_values - Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); - Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + Tensor real_part = + phi::funcs::Slice(dev_ctx, real_values, {-1}, {0}, {order}); + Tensor imag_part = phi::funcs::Slice(dev_ctx, real_values, {-1}, + {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); @@ -233,10 +245,11 @@ class EigKernel : public framework::OpKernel { for_range(functor); // 3. 
construct complex vectors - Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor real_vector_trans = + phi::TransposeLast2Dim(dev_ctx, real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel { } }; -template +template void ComputeBackwardForComplexInput( const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, - Tout* x_grad_data, int batch_count, int order, + T* x_grad_data, int batch_count, int order, const framework::ExecutionContext& context) { - auto dito = - math::DeviceIndependenceTensorOperations( - context); - - Tensor trans_v = dito.Transpose(V); - Tensor Vh = dito.Conj(trans_v); - Tensor Lconj = dito.Conj(L); - Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); - Tensor VhgV = dito.Matmul(Vh, gV); - Tensor diag_real = dito.Real(VhgV); - Tensor diag_res = dito.BatchDiag(diag_real, batch_count); - Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + + Tensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + Tensor Vh = phi::Conj(dev_ctx, trans_v); + Tensor Lconj = phi::Conj(dev_ctx, L); + Tensor Econj = phi::Subtract(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2), + phi::funcs::Unsqueeze(Lconj, -1)); + Tensor VhgV = phi::Matmul(dev_ctx, Vh, gV); + Tensor diag_real = phi::Real(dev_ctx, VhgV); + Tensor diag_res = phi::funcs::BatchDiag(dev_ctx, diag_real, batch_count); + Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), - static_cast(numel * sizeof(Tout))); - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + static_cast(numel * sizeof(T))); + + platform::ForRange for_range(orig_dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); - Tensor res2 = dito.Matmul(Vh, res1); - Tensor result = dito.Sub(VhgV, res2); + Tensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + Tensor res2 = phi::Matmul(dev_ctx, Vh, res1); + Tensor result = phi::Subtract(dev_ctx, VhgV, res2); - result.mutable_data(V.dims(), context.GetPlace()); - result = dito.Div(result, Econj); - result = dito.DiagFill(order, order, order, 0, gL, result); - Tensor rhs = dito.Matmul(result, Vh); + result.mutable_data(V.dims(), context.GetPlace()); + result = phi::Divide(dev_ctx, result, Econj); + result = + phi::funcs::DiagFill(dev_ctx, order, order, order, 0, gL, result); + Tensor rhs = phi::Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -298,10 +314,10 @@ 
void ComputeBackwardForComplexInput( // x_grad: out int m = Vh.dims()[Vh.dims().size() - 1]; int k = rhs.dims()[rhs.dims().size() - 1]; - auto* matrix_data = Vh.data(); - auto* rhs_data = rhs.data(); - math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, - batch_count); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); } template diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc6ddebd68c3edbc2de70209364bb53..4e33c567eb6d12fc504bfd76bc83072836feda21 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." 
- "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d4df24529508ff37e6a92f157658a0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 294794877b32e5fe2522080a4d388d20564486b4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::funcs::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 59eabfb29b97ee66ad470ff4e0ed65f6b5db76f4..4627acc0d07defcd0f6fc6dd82aaaac8c0f148ca 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename phi::funcs::Real; +using Real = typename phi::dtype::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -144,7 +144,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::funcs::Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -154,11 +154,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -188,10 +188,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; 
int info; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, - static_cast*>(NULL), &info); + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -208,7 +208,7 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), + rwork.mutable_data>(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index d6e0749318e901947b46b4b1d6ff8bbdb16bef36..3b7457d72e15d733a45bc10ea433db1937dbac89 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -39,7 +39,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1d2237cb5da014d11ba69a91cbe917..13fd9b81a8765aea140ad6ca2fc0383151a51dc7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e5337e3fdd758d2e95cfa61d1d62724..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548a57a1c8e7770fa282470fba4cc140..e9adb9abdb528c187817be641b81ffb6f64833b0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void 
Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
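For reference, the divide double-grad identities quoted in the comments above come from treating the first-order backward of out = x / y as a function of (dout, y, out), with dx = dout / y and dy = -dout * out / y, and differentiating it against the incoming tangents (ddx, ddy):

\[
\mathrm{ddout}=\frac{\mathrm{ddx}-\mathrm{out}\cdot \mathrm{ddy}}{y},\qquad
\mathrm{dy}=\frac{\mathrm{out}\cdot \mathrm{dx}\cdot \mathrm{ddy}-\mathrm{dx}\cdot \mathrm{ddx}}{y},\qquad
\mathrm{dout}=-\,\mathrm{dx}\cdot \mathrm{ddy},
\]

where dx is the saved first-order gradient DX that the kernel receives as an input; the phi divide_double_grad kernel declared in the tests below is expected to reproduce exactly these expressions.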
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c2d1a84f13da1cef74787003e633bb..14baeaa74d2421135401e94fbc10367d50b876fe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -90,67 +90,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * out_div_c_conj; - } -}; - // Fmax template struct FMaxFunctor { @@ -257,47 +196,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare 
template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28c0731ed0d8d91769d0b293662fe..830e09eeae4811eb44bd4e21e17fe83ee44c592d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180af4798a9f8b31e2edfd0cacb583d..f7b9fd1e265f5d3f107e734f9ffdcc90e7f6cc77 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - 
ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468f51030026e1423a649252001f58..58a3123c7e332f50b0830577436528f1e8df1cdf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = 
ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
- ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h 
b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 418779c32e8bc216be1532bf714bc21d91c452aa..102127e6ffe4ea60b8305c718e645a3695557ae4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,9 +16,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/phi/kernels/gpu/elementwise.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a1a7f8310986616d0a9f7db572ed31ca44399027..80b07721f0b4d1feb669bfce91127b0887d79391 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -44,6 +45,7 @@ limitations under the License. */ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -133,7 +135,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return phi::funcs::trim_trailing_singular_dims(dims); + return phi::funcs::TrimTrailingSingularDims(dims); } template (); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -173,19 +167,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { z->mutable_data(ctx.GetPlace()); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, - z); - -#endif - return; - } - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); + const auto &dev_ctx = ctx.template device_context(); + phi::funcs::ElementwiseCompute(dev_ctx, *x, *y, axis, + func, z); } // FusedElemwiseAndAct @@ -443,8 +427,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -991,8 +975,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); @@ -1183,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1198,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1236,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 7d7bb4f26fcf42ec63cd1fab7ec2667a03c8ba4c..f49e2ab4e173efbd2cb8a33ec3e7471faff11154 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 1f8a95f0286bd3bb228bcda59e1198bf0763eb9a..3e9263fe93acd93638ff9e496203b7ea432cea86 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -33,7 +32,7 @@ namespace p = paddle::platform; USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP(elementwise_sub); +USE_OP_ITSELF(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc index 14b20baae1b0398a40ee74a3e16c2c992a4b557e..78855dd39572539e531bcd8ad3786ae95269ca8f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc @@ -14,7 +14,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index b2cef95d1a349d66161db1c3edf7c14bc8a6d058..d15a7c272757fa683f835215e3db9ccec956af38 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -78,10 +76,16 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub); namespace ops = paddle::operators; +REGISTER_OPERATOR(elementwise_sub, ::paddle::operators::ElementwiseOp, + ::paddle::operators::ElementwiseSubOpMaker, + ::paddle::operators::ElementwiseOpInferVarType, + elementwise_subGradMaker<::paddle::framework::OpDesc>, + elementwise_subGradMaker<::paddle::imperative::OpBase>, + ::paddle::operators::ElementwiseOpInplaceInferer); + REGISTER_OPERATOR( elementwise_sub_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplaceInferer, ops::ElementwiseGradNoBufVarsInferer, @@ -92,51 +96,6 @@ REGISTER_OPERATOR(elementwise_sub_grad_grad, ops::ElementwiseDoubleGradOpInplaceInferer, ops::ElementwiseDoubleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( R"ROC(Register elementwise_sub for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu deleted file mode 100644 index 2c962af9877b978f7a6af25635f345c0ae5ffd27..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h deleted file mode 100644 index 15c547b493ae045c13ab8d6b14a646cb92716a92..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/place.h" - -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" -namespace paddle { -namespace operators { - -template -class ElementwiseSubKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, axis, z); - } -}; - -template -class ElementwiseSubGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, axis, dx, dy); - } -}; - -template -class ElementwiseSubDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* ddout = ctx.Output("DDOut"); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - paddle::optional ddx_optional = paddle::none; - paddle::optional ddy_optional = paddle::none; - if (ddx != nullptr) { - ddx_optional = *ddx; - } - if (ddy != nullptr) { - ddy_optional = *ddy; - } - phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), - *y, ddx_optional, ddy_optional, *dout, axis, ddout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index b68d38d6df12a5d11f57b1556f8fc7ceec00d3e0..4169a938f2d0bff0cf8b23db35c943c9ff586212 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc index d12c6fc30cebaafd27c099ab708e0662477cb017..87c494b0e10bad64566b5248946c9b8b1b778f2f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 5222103256d614a2d6b1fa10662367ecb20d3cb2..ea009a38056f078689bd6dc4c9a41d2b34e8c1fa 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -17,8 +17,13 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 9d4d11609ac2047aa8934cb2868f79359a816e12..ce5c6b701d95894db8e3a84215f537352914706a 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -21,9 +21,12 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8c0111f56b6651e0228acc316b1bfe..3cecc52a3c481cf9cb4a1e2eba6ded704a8fa8ee 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -27,8 +27,14 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); + +PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index e23342ebb5dc7639d68500964bfdfbd099d077cd..9e0e4e7fe1c6d26df7c4347d8bc81a985e6c973b 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/empty_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/nullary.h" + namespace paddle { namespace operators { @@ -51,46 +53,6 @@ class EmptyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* context) const override { - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); - - if (context->HasInput("ShapeTensor")) { - auto shape_dims = context->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - context->SetOutputDim("Out", phi::make_ddim(vec_dims)); - } else if (context->HasInputs("ShapeTensorList")) { - std::vector out_dims; - auto dims_list = context->GetInputsDim("ShapeTensorList"); - for (size_t i = 0; i < dims_list.size(); ++i) { - auto& dims = dims_list[i]; - PADDLE_ENFORCE_EQ(dims, phi::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of Tensor in list must be [1]. " - "But received the shape is [%s]", - dims)); - - out_dims.push_back(-1); - } - - context->SetOutputDim("Out", phi::make_ddim(out_dims)); - } else { - auto& shape = context->Attrs().Get>("shape"); - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE_GE( - shape[i], 0, - platform::errors::InvalidArgument( - "Each value of attribute 'shape' is expected to be no less " - "than 0. But recieved: shape[%u] = %d; shape = [%s].", - i, shape[i], phi::make_ddim(shape))); - } - context->SetOutputDim("Out", phi::make_ddim(shape)); - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, @@ -126,14 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OPERATOR( - empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); +DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PD_INFER_META(phi::CreateInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, + ops::EmptyOpVarTypeInference, + EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc deleted file mode 100644 index 22799e507aeff7940274f729b174f50bfd9132a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/empty_op.cu.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/empty_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h deleted file mode 100644 index cb466fffcd7c7358b6e84c18b7895a17b2eaa907..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/empty_op.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EmptyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor *out_tensor = context.Output("Out"); - - auto shape = GetShape(context); - out_tensor->Resize(shape); - - out_tensor->mutable_data(context.GetPlace(), - framework::TransToPhiDataType(dtype)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index f68f670394871114369f8b05b7f958c03d5508d0..64274d098c0585c28196743c09d5e6c78c3fe37d 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of ErfOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of ErfOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker, ops::ErfGradOpMaker, - ops::ErfGradOpMaker); + ops::ErfGradOpMaker, + ErfInferShapeFunctor); REGISTER_OPERATOR(erf_grad, ops::ErfGradOp); -REGISTER_OP_CPU_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CPU_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); - -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h deleted file mode 100644 index 4780b2e7f5b28d4a743f6d35046891b30cbefd00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erf_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class ErfKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - EigenErf, T>::Eval(place, eigen_out, - eigen_in); - } -}; - -template -class ErfGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - EigenErfGrad, T>::Eval(place, eigen_dx, - eigen_x, eigen_dout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 3d409b4c4f6772bc7b234208e78c5088eeb2fc00..374b00792622f91edc0b66cebb278cc79f30dc66 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 119e514a49e28fb3295e36947664770889bbdd81..97a35a34f23e96707269482e29da13a15538cdca 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -121,37 +121,9 @@ REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CPU_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#endif REGISTER_OP_VERSION(expand_as_v2) .AddCheckpoint( R"ROC(fix expand_as_v2 and add new input [Y])ROC", 
paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); \ No newline at end of file + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index d7560efc5c1f1244ae4eed4c68c59a38287057ee..f09e7764eed3959c7f0ca700b953dbd0c2891d12 100755 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -32,219 +32,5 @@ template using EigenTensor = framework::EigenTensor; -template -class ExpandAsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, MAX_RANK_SUPPORTED)); - - switch (target_rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - target_shape[i], 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_as_v2 op.", - target_shape[i])); - repeat_times[i] = target_shape[i]; - } else if (target_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - target_shape[i], -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_as_v2 op, " - "only -1 is supported, but the value received is %d.", - target_shape[i])); - repeat_times[i] = 1; - } - } - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims = phi::make_ddim(target_shape); - - 
out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } -}; - -template -class ExpandAsV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto target_shape = context.Attr>("target_shape"); - auto x_dims = in0->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - repeat_times[i] = target_shape[i] / vec_in_dims[i]; - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), - out0); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. 
But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a6a74bb11be44589877234021764..df00ae54c1036b1b0f0899eb0a949d58c398aa48 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index ee456dcdafbc51d547e7beacc4e4e79f98738b88..1a48a6767852e138e7725a68ca4ffc56de8234be 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -76,7 +76,7 @@ class ExponentialKernel auto engine = gen->GetCPUEngine(); std::uniform_real_distribution uniform(0.0, 1.0); - distribution::exponential_transform trans(lambda); + phi::funcs::exponential_transform trans(lambda); for (int64_t i = 0; i < size; ++i) { out_data[i] = trans(uniform(*engine)); } diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu index 8b989501e4f4248b0c2e3b23e1e75a4865b08588..d5abbf9a26afe6bcbbd8549f59d632fc4e53fec2 100644 --- a/paddle/fluid/operators/exponential_op.cu +++ b/paddle/fluid/operators/exponential_op.cu @@ -26,9 +26,9 @@ class ExponentialKernel auto& dev_cxt = ctx.template device_context(); T lambda = static_cast(ctx.Attr("lambda")); - distribution::uniform_distribution dist; - distribution::exponential_transform trans(lambda); - distribution::distribution_and_transform(dev_cxt, out, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); } }; diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index fbcabc594db0814da1ec50934a0f02514dc208be..7ded174a9f47ede48a49b19b25539867ce344fb0 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distribution_helper.h" +#include 
"paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 8f8a0f174a79f13f0bee7aa7b425f8c645e15687..537c218d357b67980216ab3053707b8adb867c01 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -21,24 +24,6 @@ class EyeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of EyeOP should not be null.")); - auto num_rows = ctx->Attrs().Get("num_rows"); - PADDLE_ENFORCE_EQ( - num_rows >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_rows) should be non-negative int.")); - auto num_columns = ctx->Attrs().Get("num_columns"); - if (num_columns == -1) num_columns = num_rows; - PADDLE_ENFORCE_EQ( - num_columns >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_columns) should be non-negative int.")); - ctx->SetOutputDim("Out", {num_rows, num_columns}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -82,8 +67,11 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PD_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + EyeInferShapeFunctor); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 0eb84f18f25f03b1fd0310c5815ee342ff835a6f..27a235765227f15dd412dcd6ad55f2a24471c6da 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/attn_feed_forward.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,6 +30,11 @@ namespace platform = paddle::platform; USE_OP(matmul); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif + // get paddle matmul op results as baseline template void GetLinearOp(const std::vector &x, const std::vector &y, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 79018f2a97448a8c6265a969dad37bce77d1b7ee..cb03add3143278260d41c3893e7adad976908d4e 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003) +#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..508730c3c7335dbad8cf70417d2c19be4a8480a2 --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -0,0 +1,655 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + /* + if (!x1_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x1_lods_data[p] = p; + } + if (idx == 0) { + x1_lods_data[N] = N; + } + } + + if (!x2_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x2_lods_data[p] = p; + } + if (idx == 0) { + x2_lods_data[N] = N; + } + } + + if (!x1_lods_filled || !x2_lods_filled) { + b.sync(); + } + */ + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = 
prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + // flag + // local_flag = 0; + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + // 32 threads + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + + // ins_start = 1 + // BUG fix + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + // batch_len = 2 + // batch_len = 4 + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + // t = 0 + // t = 1 + int t = out_lods_idx - 1; + // out_lods_data[0] = 0; + int previous; + + if (out_lods_idx == p_flag + 1) { + // out_lods_data[t] = p_out_lods; + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + + // optimized + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + // int x2_lods_filled = 1; + + Vector x2_lods; + // Vector, in GPU + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + // x2_lods_filled = 1; + + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(i + 1); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + // Vector, in GPU + // int x1_lods_filled = 1; + Vector x1_lods; + + if (!is_x1_lod) { + // move to cuda + // x1_lods.resize(x1->dims()[0] + 1); + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + // x1_lods_filled = 1; + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + // int out_first = x1->dims()[0]; + // if (x1_lods_filled) { + // out_first = x1_lods.back(); + // } + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + 
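// ---------------------------------------------------------------------------
// Illustrative aside (not part of this patch): the offset vectors handled here
// (x1_lods, x2_lods, out_lods) follow the LoD convention noted in the comments
// above, lod = {0, Sum(ins1), Sum(ins1, ins2), ...}, so instance i owns rows
// [lod[i], lod[i+1]) of the dense tensor. A tiny host-side illustration with
// hypothetical toy values, independent of the kernel:
#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> lod = {0, 2, 5, 6};  // 3 instances with 2, 3 and 1 rows
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    std::printf("instance %zu -> rows [%zu, %zu)\n", i, lod[i], lod[i + 1]);
  }
  return 0;
}
// ---------------------------------------------------------------------------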
if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / 
block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e360cf2edad97b21cb6e9ddba066..3abc980ceaafc3719c13cad51c346282be2c694f 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff4bacd52b19a1c0b12300a35b61e..92f59e118c3b7bb66a2c5c76d66109ddf04ee076 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - 
blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 67287afa6ae5059f8af3dcdbd6910ca35db7c3c0..80e7f5c001d4b8139b538570c42fcd8d2604961b 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,7 +19,8 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op - resnet_unit_op) + resnet_unit_op + fused_gemm_epilogue_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() + + if (CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 20801d2243fb395b250f8416f1e2f5ba6a1423a4..3a2de0c4a093514a1c40321ab7dad61011709204 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary( template void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, const T* in0, const T* in1, T* out) { - int in_vec_size = std::min(platform::GetVectorizedSize(in0), - platform::GetVectorizedSize(in1)); - int out_vec_size = std::min(4, platform::GetVectorizedSize(out)); + int in_vec_size = + std::min(phi::GetVectorizedSize(in0), phi::GetVectorizedSize(in1)); + int out_vec_size = std::min(4, phi::GetVectorizedSize(out)); int vec_size = std::min(out_vec_size, in_vec_size); int numel = m * n; @@ -191,9 +191,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num, int num_block = (max_threads / left_num); if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - *blocking_size = phi::kernels::details::GetLastPow2(reduce_num / num_block); + *blocking_size = phi::funcs::details::GetLastPow2(reduce_num / num_block); if (*blocking_size <= 1) { - *blocking_size = phi::kernels::details::GetLastPow2(sqrt(reduce_num)); + *blocking_size = phi::funcs::details::GetLastPow2(sqrt(reduce_num)); } else if (*blocking_size * 2 < reduce_num) { *blocking_size *= 2; } diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 
bb5b363fe83995faf69f61b0a1a1693ff758fa37..5dbf4fb88b2a78838ce0fe95be653f68f4805416 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 6119af18ce153ac2bcd5d45a69ab7b5d86a3cc10..b3ac3606eaf8ee843a2be98b7a237037afaf524f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -32,7 +32,7 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 1864bdbb86667290474d297cc481f5d6352c8022..a80f590aa495db8090a30118ed4128843c0f8860 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,10 +30,10 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(conv2d); -USE_OP(conv2d_grad); -USE_OP_DEVICE_KERNEL(conv2d, CUDNN); -USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); +USE_OP_ITSELF(conv2d); +USE_OP_ITSELF(conv2d_grad); +PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, @@ -404,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -420,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -436,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 020277675797358bf87a58ac108e6eaaddb26ccc..54e4cbdc1624921e6946210a6a192d10fcbdb7dd 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -69,20 +70,21 @@ class FMHARef { ~FMHARef() {} void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor* cache_kv_tensor, const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* transpose_2_out_tensor, + Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor, Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, Tensor* dropout_mask_out_tensor, Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, Tensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] - // transpose with perm [2, 0, 1, 3, 4], + // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); - T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -90,11 +92,30 @@ class FMHARef { T* dropout_out_data = dropout_out_tensor->data(); T* fmha_out_data = fmha_out_tensor->data(); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; - int k_size = q_size; + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + auto kv_tensor = transpose_2_out_tensor->Slice(1, 3); + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; T* q_ptr = qkv_data; - T* k_ptr = q_ptr + q_size; - T* v_ptr = k_ptr + k_size; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + k_size; + } // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -102,7 +123,7 @@ class FMHARef { auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; - int gemm_n = seq_len_; + int gemm_n = out_seq_len; int gemm_k = head_dim_; T alpha = static_cast(1.0 / sqrt(head_dim_)); T beta = static_cast(0.0); @@ -133,16 +154,16 @@ class FMHARef { transB = CblasNoTrans; gemm_m = seq_len_; gemm_n = head_dim_; - gemm_k = seq_len_; + gemm_k = out_seq_len; alpha = static_cast(1.0); stride_a = gemm_m * gemm_k; stride_b = gemm_k * gemm_n; if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +263,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + 
static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index d141800d61c0ec0b73fe2cc3c8d00dbf1de44cf2..e473f8ff0662cfc3fd7bdc5010bfa1dc08fba85f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", "FusedAttentionOp"); + if (ctx->HasInput("CacheKV")) { + OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut", + "FusedAttentionOp"); + } if (ctx->HasInput("SrcMask")) { OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", "FusedAttentionOp"); @@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "input qkv_weight = [%s]", x_dim, y_dim)); - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } if (ctx->Attrs().Get("pre_layer_norm") == true) { ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); @@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // [3, batch_size, num_head, seq_len, head_size] ctx->SetOutputDim("TransposeOut2", {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); - // [batch, num_head, seq_len, seq_len] - ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (ctx->HasInput("CacheKV")) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = ctx->GetInputDim("CacheKV"); + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + y_dim[2], c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + ctx->SetOutputDim("CacheKVOut", + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + + // [batch, 
num_head, seq_len, out_seq_len] + ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->HasInput("SrcMask")) { - ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SrcMaskOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } // the same as QKOut's shape. ctx->SetOutputDim("AttnDropoutOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->Attrs().Get("attn_dropout_is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } - ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SoftmaxOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); // [batch_size, num_heads, seq_len, head_dim] ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); // [batch_size, seq_len, number of heads*head size] @@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput("QKVW", "The qkv weight tensor."); AddInput("QKVBias", "The qkv bias tensor.").AsDispensable(); + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable(); AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") .AsDispensable(); AddInput("OutLinearW", "The out_linear weight tensor."); @@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BiasDropoutResidualOut", "Result of residual + dropout(src + bias).") .AsIntermediate(); + AddOutput("CacheKVOut", "The udpated cache KV."); AddOutput("Y", "Result after attention."); AddAttr("pre_layer_norm", @@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "0.0 and 0.001, But received [%s].", ln_epsilon)); }); + AddAttr( + "ring_id", + "ring id for tensor model parallel. distributed training and inference") + .SetDefault(-1); AddComment(R"DOC( Add fused attention op whose logic is as follows: diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 03f51fc5857985902c21ad12fefbdc9cdec6ef04..d26577f06fe683fb1528c61b4401b9e578c90c9f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -27,11 +27,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *src_mask = ctx.Input("SrcMask"); auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); auto *qk_out = ctx.Output("QKOut"); auto *qktv_out = ctx.Output("QKTVOut"); auto *softmax_out = ctx.Output("SoftmaxOut"); @@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // final output. auto *out = ctx.Output("Y"); @@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel { // get data ptr for FMHA. auto *transpose_out_2_data = transpose_out_2->mutable_data(ctx.GetPlace()); + auto *cache_kv_out_data = + (cache_kv_out == nullptr) + ? nullptr + : cache_kv_out->mutable_data(ctx.GetPlace()); auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); auto *src_mask_out_data = @@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { output_size = hidden_size; // (transA, transB, compute_bias) = (false, false, false) + // NOTE(Yuang Liu): For general input size == output size, change the + // position won't have effects. For mp, the output size is mp_head * dkey + // which is actually the input size. While the input size is hidden size, + // which is actually the output size. So for out linear, switch the + // input size and output size. 
auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, - output_size, input_size, false); + input_size, output_size, false); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel { qkv_bias_out); } if (qkv_bias == nullptr) { - fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out, + src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); } else { - fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, + qk_out, src_mask_out, softmax_out, attn_dropout_mask_out, + attn_dropout_out, qktv_out, fmha_out); } // fmha_out: [batch_size, seq_len, num_head, head_dim] @@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel { // out_linear_out: [batch_size, seq_len, embed_dim] out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr, out_linear_out, nullptr); + // tensor model parallel + AllReduce(*out_linear_out, ring_id, ctx.cuda_device_context()); + if (pre_layer_norm) { // output = (residual + dropout(input + bias)) fused_dropout_layernorm_helper.ResidualDropoutBias( @@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // get inputs. 
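// ring_id == -1 (the default) disables tensor model parallelism: the
// AllReduce helper defined above returns immediately in that case. In this
// backward kernel the all-reduce is applied further below to the gradient of
// the replicated input (d_ln_out when pre_layer_norm, otherwise d_x),
// mirroring the forward all-reduce on out_linear_out.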
auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel { transA = false; transB = false; bool compute_bias = false; + // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, - output_size, input_size, compute_bias); + input_size, output_size, compute_bias); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_ln_out, ring_id, ctx.cuda_device_context()); layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, ln_mean_data, ln_var_data, d_x_data, d_ln_scale_data, d_ln_bias_data); @@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_x, ring_id, ctx.cuda_device_context()); } // gradient accumulation std::vector ins; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 994601a2f0608b4fc04966c7549c421f395f3ec7..9f5a1bad047b44b715e11e74d92fdca1982c96f8 100755 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, const T factor, const int64_t size, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; LoadT src_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); - platform::Load(&src[i], &src_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); + phi::Load(&src[i], &src_vec); StoreT dx_vec; #pragma unroll @@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { @@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, LoadT bias_vec; MaskLoadT mask_vec; - platform::Load(&dout[index], &dout_vec); - platform::Load(&src[index], &src_vec); - platform::Load(&mask[index], &mask_vec); - platform::Load(&bias[col_id * VecSize], &bias_vec); + 
phi::Load(&dout[index], &dout_vec); + phi::Load(&src[index], &src_vec); + phi::Load(&mask[index], &mask_vec); + phi::Load(&bias[col_id * VecSize], &bias_vec); StoreT dx_vec; #pragma unroll @@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, dx_vec[i] = val; tmp_sum[i] += val; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 2381b5b7fdfb85cbaa3fd66a10c5b630bb515f15..717c1732b7b3acf8528887aae43471c0dc0716e3 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,8 +20,14 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/functors.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f79277e4e8f0d22cedafc9f7b40b56ecd2d6a817..6bf3a7114f4ced3c7c6ecd1f1afeca60ff66528f 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,11 +21,11 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d81566c3833e79e8cfa31a7d2dc68c..18c7187fc8e64c9fed8a86a984954b5420c1e5b5 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -31,7 +31,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); +USE_OP_ITSELF(dropout); USE_OP(layer_norm); template diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 56c2c86e1a70d64d4f96e10bbdd353dab4b7e932..7308f30779248e64f55e10b0661d2c98d263416c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -243,12 +243,12 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ if (platform::MayIUse(platform::avx)) { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ @@ -473,7 +473,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); @@ -591,7 +591,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { #undef MOVE_ONE_BATCH #undef DEFINE_CUR - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae4260441f6c873b48735a01b967b70ef4bb..f3f8f1742757783a082437638f67407700963eb1 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955bdd17a0552836121589d8edeb4a38e..c38d9f7d4bcbd25b3111b35a918de0f4ebdabefb 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class 
FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c4e3661e6d6edc5ea95b77cd283cc99afcca8ed --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -0,0 +1,353 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedGemmEpilogueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias", + "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedGemmEpilogueOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto bias_dims = ctx->GetInputDim("Bias"); + + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + bias_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input tensor bias's dimension of FusedGemmEpilogueOp " + " should be == 1, but got %d.", + bias_dims.size())); + + PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1], + platform::errors::InvalidArgument( + "The Input tensor bias's dimension 0" + " should be == Y[-1], but got bias's shape = [%s] " + "and Y's shape = [%s]", + bias_dims, y_dims)); + + auto x_mat_dims = + phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + + int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; + int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + + PADDLE_ENFORCE_EQ( + K_from_x, K_from_y, + platform::errors::InvalidArgument( + "The last dimension of X should be equal with Y's first dimension." + "But received X[-1] = [%d], Y[0] = [%d].", + K_from_x, K_from_y)); + + auto activation = ctx->Attrs().Get("activation"); + + if ((activation != "relu") && (activation != "gelu") && + (activation != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + + if (activation == "none" && ctx->HasOutput("ReserveSpace")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The ReserveSpace would not be used when activation = \"none\"")); + } + + // cublasLt's restriction for auxiliary. + if (ctx->HasOutput("ReserveSpace") && activation != "none") { + int min_size_of_n = activation == "relu" ? 128 : 8; + int N_size = trans_y ? 
y_dims[0] : y_dims[1]; + PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, + platform::errors::InvalidArgument( + "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) " + "should be a multiple of %d when ReserveSpace is given " + "and activation=%s, but got N = %d.", + min_size_of_n, activation, N_size)); + } + + std::vector<int64_t> out_dims; + out_dims.reserve(static_cast<size_t>(x_dims.size())); + if (trans_x) { + for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]); + } else { + for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]); + } + + if (trans_y) { + out_dims.push_back(y_dims[0]); + } else { + out_dims.push_back(y_dims[1]); + } + + ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + // Note (Ming Huang): Reserve space of relu is a bit-mask, + // which cannot pass nan_and_inf checking if shape is set. + if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias)."); + AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias)."); + AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias)."); + + AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); + AddOutput("ReserveSpace", + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. If not given (empty string), the + auxiliary mode would not be enabled.)DOC") + .AsDispensable() + .AsExtra(); + + AddAttr<bool>( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X can have more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr<bool>( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two-dimensional. When + set trans_y=true, it would transpose Y. For instance: Y with shape + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); + + AddAttr<std::string>( + "activation", + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, no activation + is applied.)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogue Operator +This operator is used to perform Activation(Elementwise_add(Matmul(X, Y), bias)). +It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU). + +Note: +X can have more than two dimensions and is flattened to 2D for computing. 
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut", + "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp"); + + auto dout_dims = ctx->GetInputDim("DOut"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_GE( + dout_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + dout_dims.size())); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueGradOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + dout_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The Input tensor DOut's and X's dimension of " + "FusedGemmEpilogueGradOp " + " should be the same, but got DOut's dim = %d and X's = %d.", + dout_dims.size(), x_dims.size())); + + auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[1], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of DOut should be equal with Y's last" + "dimension. But received DOut[-1] = [%d], Y[1] = [%d].", + dout_mat_dims[1], y_dims[1])); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[0], x_mat_dims[0], + platform::errors::InvalidArgument( + "The first dimension of DOut should be equal with X's first" + "dimension. But received DOut[0] = [%d], Y[0] = [%d].", + dout_mat_dims[0], x_mat_dims[0])); + + auto activation_grad = ctx->Attrs().Get("activation_grad"); + if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") && + (activation_grad != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation_grad)); + } + + if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "The ReserveSpace should not be empty. 
" + "when activation_grad == {relu_grad, gelu_grad}.")); + } + + if (ctx->HasOutput("DX")) { + std::vector dx_dims; + dx_dims.reserve(static_cast(x_dims.size())); + for (int i = 0; i < x_dims.size(); ++i) { + dx_dims.push_back(x_dims[i]); + } + ctx->SetOutputDim("DX", phi::make_ddim(dx_dims)); + } + + std::vector dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size()); + ctx->SetOutputDim("DY", phi::make_ddim(dy_dims)); + + if (ctx->HasOutput("DBias")) { + std::vector dbias_dims; + dbias_dims.push_back(y_dims[1]); + ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DOut", + "The input grad tensor to Out of Out = (Act(X) * Y) + bias"); + AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); + AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); + AddInput("ReserveSpace", + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable(); + + AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + AddOutput("DY", + "The output grad tensor to Y of Out = (Act(X) * Y) + bias."); + AddOutput("DBias", + "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + + AddAttr( + "activation_grad", + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + be null operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogueGrad Operator +This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias). +It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear. + +Note: +X could be more than two dimension and would be flatten to 2D for computing. +X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, + ops::FusedGemmEpilogueGradOpMaker) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e16c9e8f483ccc2cbf1d7006159cccfe906dd06b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. 
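// The relu epilogue's auxiliary output (CUBLASLT_EPILOGUE_RELU_AUX_BIAS) is
// a bit-mask holding one bit per element of Out, hence numel / 8 bytes; the
// gelu epilogue instead stores the pre-activation values, one T per element
// (the else branch below). As an illustrative example (assumed shape), an
// fp16 Out of [4096, 1024] needs 4096 * 1024 / 8 = 512 KiB of reserve space
// for relu but 4096 * 1024 * 2 bytes = 8 MiB for gelu.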
+ reserve_space_size = phi::product(out->dims()) / 8; + } else { + reserve_space_size = phi::product(out->dims()) * sizeof(T); + } + reserve_space->mutable_data(ctx.GetPlace(), out->type(), + reserve_space_size); + void* aux_data = reinterpret_cast(reserve_space->data()); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; + if (trans_x) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + if (trans_y) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, K)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx, workspace_size); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, operation_desc, alpha, y->data(), y_desc, x->data(), + x_desc, beta, out_data, out_desc, out_data, out_desc, algo, + workspace->ptr(), workspace_size, stream)); + } + + private: + static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation, + bool enable_auxiliary) { + if (activation == "relu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS + : CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == "gelu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS + : CUBLASLT_EPILOGUE_GELU_BIAS; + } else if (activation == "none") { + return CUBLASLT_EPILOGUE_BIAS; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation=%s.", + activation)); + } + } +}; + +template +class FusedGemmEpilogueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation_grad = ctx.Attr("activation_grad"); + + auto dout_mat_dims = + phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); + auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + + int64_t M = x_mat_dims[0]; + int64_t K = y->dims()[0]; + int64_t N = y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + cublasOperation_t trans_dout = CUBLAS_OP_N; + cublasLtMatrixLayout_t dout_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dout_desc, mat_type, N, M, N)); + + if (dx) { + cublasLtMatmulDesc_t dx_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dx_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_y = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, + sizeof(trans_y))); + cublasLtEpilogue_t epiloque_func_for_dx = + get_epilogue_type_(activation_grad); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dx, sizeof(epiloque_func_for_dx))); + + if (activation_grad != "none") { + auto* aux_data = reserve_space->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, K, M, K)); + + memory::allocation::AllocationPtr dx_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dx_operation_desc, alpha, y->data(), y_desc, + dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, + algo, dx_workspace->ptr(), workspace_size, stream)); + } + + if (dy) { + cublasLtMatmulDesc_t dy_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dy_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, + sizeof(trans_x))); + cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr + ? CUBLASLT_EPILOGUE_DEFAULT + : CUBLASLT_EPILOGUE_BGRADA; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); + + if (dbias) { + dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, sizeof(dbias_data))); + } + + cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, N, K, N)); + + memory::allocation::AllocationPtr dy_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dy_operation_desc, alpha, dout->data(), dout_desc, + x->data(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo, + dy_workspace->ptr(), workspace_size, stream)); + } + } + + private: + static cublasLtEpilogue_t get_epilogue_type_( + const std::string& activation_grad) { + if (activation_grad == "relu_grad") { + return CUBLASLT_EPILOGUE_DRELU; + } else if (activation_grad == "gelu_grad") { + return CUBLASLT_EPILOGUE_DGELU; + } else if (activation_grad == "none") { + return CUBLASLT_EPILOGUE_DEFAULT; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation_grad attribute of fused_gemm_epilogue op should " + "be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation_grad=%s.", + activation_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDA_VERSION >= 11060 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); +#endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index ceba3accca7727b5e4f22951d87f9e91034e3403..d53a24a57e3cc1ede127f497a9be9e3b5fa1ab0b 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -42,12 +42,12 @@ __device__ void CalcLayernormY( const LayerNormScaleBiasT *bias, const T *x, T *y, const int row_id, const int col_id, const int cols, const LayerNormParamType mean_val, const LayerNormParamType invvar) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using LoadU = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using LoadU = phi::AlignedVector; using LoadScaleOrBias = - platform::AlignedVector, - VecSize>; + phi::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { LoadScaleOrBias scale_vec; LoadScaleOrBias bias_vec; @@ -60,15 +60,15 @@ __device__ void CalcLayernormY( static_cast>(0); } // vectorize load data from global - platform::Load(&x[row_id * cols + i], &x_vec); + phi::Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - platform::Load, - VecSize>(&scale[i], &scale_vec); + phi::Load, VecSize>( + &scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load, - VecSize>(&bias[i], &bias_vec); + phi::Load, VecSize>( + &bias[i], &bias_vec); } StoreT y_vec; @@ -78,7 +78,7 @@ __device__ void CalcLayernormY( (static_cast(x_vec[ii]) - mean_val) * invvar + static_cast(bias_vec[ii])); } - platform::Store(y_vec, &y[row_id * cols + i]); + phi::Store(y_vec, &y[row_id * cols + i]); } } @@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); - platform::Load( - 
residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + &residual[it]); col += THREADS_PER_ROW; } @@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // store dropout_residual_out and mask_out #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store( + phi::Store( x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); - platform::Store( + phi::Store( mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } @@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index cc14d0680d381ff2bbe73ee712e218c9c4d79185..032440d7f0478dc087e3ba38274f2a31a9a66a23 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif /** * @brief The unit test of fused_layernorm_residual_dropout_bias diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1b135ad6098e58f457f5d21e73ac6d1a6a7c4074..1d3085a013f81ee9dca21468476df8f621bb26c2 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val, Functor act_func) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; using U = typename details::MPTypeTrait::Type; LoadT src_vec; @@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( residual_vec[ii] = static_cast(0); } // vectorize load data from global - platform::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load(&src[row_id * cols + col_id], &src_vec); if (residual) { - platform::Load(&residual[row_id * cols + col_id], - &residual_vec); + phi::Load(&residual[row_id * cols + col_id], &residual_vec); } if (bias) { - platform::Load(&bias[col_id], &bias_vec); + phi::Load(&bias[col_id], &bias_vec); } MaskStoreT mask_vec; @@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result to global - platform::Store(dest_vec, &dst[row_id * cols + col_id]); + phi::Store(dest_vec, &dst[row_id * 
cols + col_id]); if (!is_test) { - platform::Store(mask_vec, &mask[row_id * cols + col_id]); + phi::Store(mask_vec, &mask[row_id * cols + col_id]); } } @@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, LoadT out_vec; MaskLoadT mask_vec; StoreT dx_vec; - platform::Load(&dout[index], &out_vec); - platform::Load(&mask[index], &mask_vec); + phi::Load(&dout[index], &out_vec); + phi::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 1a12e6b565f02035b3fb9673636c2344823f288e..5dff5e2225f4f3bf3a20daa02b2b4194bd8cb99e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 41a69031c54b31cd7e67ce428e710b3a87081f48..3311e3b4ebc9e21d0a033e54ba162e72a80326d0 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. 
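Note: the hunks above only move the vectorized helpers from platform:: to phi:: (AlignedVector, Load, Store); the load/compute/store pattern itself is unchanged. As a standalone illustration of that pattern, here is a cut-down analogue, not the phi implementation, assuming n is a multiple of VecSize and both pointers are aligned to sizeof(T) * VecSize:

#include <cstdint>

// Illustration only: a tiny stand-in for the AlignedVector pattern used above.
// VecSize elements are moved with one aligned vector access instead of
// VecSize scalar accesses.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) TinyAlignedVector {
  T val[VecSize];
  __host__ __device__ T& operator[](int i) { return val[i]; }
  __host__ __device__ const T& operator[](int i) const { return val[i]; }
};

template <typename T, int VecSize>
__global__ void ScaleKernel(const T* x, T* y, int64_t n, T factor) {
  using VecT = TinyAlignedVector<T, VecSize>;
  int64_t idx =
      static_cast<int64_t>(blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x * VecSize;
  for (int64_t i = idx; i + VecSize <= n; i += stride) {
    // Vectorized load, elementwise work, vectorized store -- the same
    // three-step shape as the phi::Load / compute / phi::Store hunks above.
    VecT in = *reinterpret_cast<const VecT*>(x + i);
    VecT out;
#pragma unroll
    for (int k = 0; k < VecSize; ++k) out[k] = in[k] * factor;
    *reinterpret_cast<VecT*>(y + i) = out;
  }
}

The phi::Load and phi::Store calls in the diff play the role of the two reinterpret_cast moves here.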
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -368,7 +368,7 @@ class FusionGRUKernel : public framework::OpKernel { hidden_out->mutable_data(place); auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; math::FCFunctor fc; if (M > D3) { @@ -463,7 +463,7 @@ class FusionGRUKernel : public framework::OpKernel { batched_input_data = cur_batched_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_out, hidden_out); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 06d406867f07431999f11d76e907a75fcc917ff2..00be8b09d1296018f36c0299f415b7c27f0fad14 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -421,7 +421,7 @@ class FuisonLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); math::FCFunctor fc; @@ -514,7 +514,7 @@ class FuisonLSTMKernel : public framework::OpKernel { batched_input_data = cur_in_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 88fb7349d538afd6d7bf4fa6947ac21307db66d8..1000d0488dc3ffcf6cde977be47ce77d2bc947a7 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -196,10 +196,10 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 84826ff3993ff7a746d34294311c9b8b429f5ea6..c2260c53b2edd09dd69d126bc5e61b995fb20467 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 8da900d84f9bcedd5e4b318837fe1bb29697a6be..e5ca15a39ef51f7807246c2ee1d473a0499b6463 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/gather_nd_op.h" -#include -#include -#include -#include "paddle/phi/core/ddim.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -25,48 +25,10 @@ class GatherNdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherNdOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_dims_size = x_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], x_dims_size, - platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); - PADDLE_ENFORCE_GE(index_dims_size, 1UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); - - std::vector result_dims; - // The result dims is - // Index.shape[:-1] + X.shape[Index.shape[-1]:] - for (int i = 0; i < index_dims_size - 1; ++i) { - result_dims.emplace_back(index_dims[i]); - } - for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) { - result_dims.emplace_back(x_dims[i]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(result_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); const auto& x_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); return framework::OpKernelType( x_type, @@ -80,11 +42,6 @@ class GatherNdGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,23 +130,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PD_INFER_META(phi::GatherNdInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + PD_INFER_META(phi::GatherNdGradInferMeta)); + REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, - ops::GatherNdGradOpMaker); + ops::GatherNdGradOpMaker, + GatherNdInferShapeFunctor); REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp, - ops::GatherNdGradNoNeedBufferVarInferer); - -REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel); - 
-REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel); + ops::GatherNdGradNoNeedBufferVarInferer, + GatherNdGradInferShapeFunctor); diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu deleted file mode 100644 index 0de2798bf750915e99c9b60ed8ccb94d7d1201ab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_nd_op.cu +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/fluid/operators/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherNdOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGatherNd(ctx, *x, *index, output); - } - } -}; - -template -class GatherNdGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - - 
PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h deleted file mode 100644 index f458c0e18013b4d7a85d960e0e7df1b2d21638fe..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_nd_op.h +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherNdOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } - } -}; - -template -class GatherNdGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - ScatterNdAdd(ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 995ab5d0ddf0fda19a163ec31a00a14985b5dbb9..c916f44b874a08a13fb967aae1f8b6a136023b31 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_nd_op_xpu.cc b/paddle/fluid/operators/gather_nd_op_xpu.cc index 9f4c522bd145bedd09fd746781cef5efec15c139..d4cb799e825b640a2a4e0a464e18d63c5e5ed516 100644 --- a/paddle/fluid/operators/gather_nd_op_xpu.cc +++ b/paddle/fluid/operators/gather_nd_op_xpu.cc @@ -11,7 +11,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -20,9 +23,9 @@ template class GatherNdXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); out->template mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cf4d7b1d670b8add6ff5a138851c6a23ee54169e..8a405cc6fc1baefe997fb5b6133a56d6a2fc0438 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel); + ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 19568835a6e96080bb1c0af642bf9cb19c346bf9..e0db2f26d3e0534f924cc709b98689fb3f1a5cc6 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -45,15 +45,23 @@ class GatherOpCUDAKernel : public framework::OpKernel { axis = static_cast(cpu_axis.data()[0]); } else if (axis_type == framework::proto::VarType::INT64) { axis = static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT16) { + axis = static_cast(cpu_axis.data()[0]); } } const auto &place = ctx.GetPlace(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &dev_ctx = ctx.cuda_device_context(); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -61,9 +69,11 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -93,14 +103,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } } + const auto &dev_ctx = ctx.cuda_device_context(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } return; } @@ -112,11 +123,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } } }; @@ -130,9 +141,12 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, 
ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 016c2b398daaad92ec60e37606345e0c6c4e13f5..94de694b2f9bc484cdb60298b60d5a9433dac181 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel { // get axis from tensor if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, *index, output); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); } } }; @@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, 
ctx.GetPlace()); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); } return; } dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = *dev_ctx.eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - if (index_type == framework::proto::VarType::INT32) { + if (index_type == phi::DataType::INT32) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } - } else if (index_type == framework::proto::VarType::INT64) { + } else if (index_type == phi::DataType::INT64) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } } } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 0f3dcdadcf897dc05d131225cdffe11f84043c14..c962dd065234f37fe98481c9866f7d2f405db69c 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::CPUGather(ctx, *src, *index, output); + phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 830134e57e0e72c5470ac79714015a94df9888bf..c84e94f5c71277c4fe8f25b73b266169f0d0877a 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
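Note: the gather/scatter pair that GatherOpKernel and GatherGradientOpKernel now obtain from phi::funcs has the axis-0 semantics sketched below. This is a schematic host-only sketch, not the phi::funcs implementation, and it assumes d_x is pre-sized and zero-initialized, as the grad kernel above does via dxt.constant(0):

#include <cstddef>
#include <vector>

// Axis-0 gather: out[i] = x[index[i]], mirroring the CPUGather call above.
template <typename T, typename IndexT>
void GatherRows(const std::vector<std::vector<T>>& x,
                const std::vector<IndexT>& index,
                std::vector<std::vector<T>>* out) {
  out->clear();
  for (IndexT row : index) out->push_back(x[row]);
}

// Gradient: scatter d_out rows back into d_x. With overwrite=true the last
// write wins (ScatterAssign); otherwise duplicate indices accumulate
// (ScatterAssignAdd), matching the branch in GatherGradientOpKernel above.
template <typename T, typename IndexT>
void GatherRowsGrad(const std::vector<std::vector<T>>& d_out,
                    const std::vector<IndexT>& index, bool overwrite,
                    std::vector<std::vector<T>>* d_x) {
  for (std::size_t i = 0; i < index.size(); ++i) {
    auto& dst = (*d_x)[index[i]];
    for (std::size_t j = 0; j < dst.size(); ++j) {
      if (overwrite) {
        dst[j] = d_out[i][j];
      } else {
        dst[j] += d_out[i][j];
      }
    }
  }
}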
*/ -#include "paddle/fluid/operators/gather_tree_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree"); - OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree"); - - auto ids_dims = ctx->GetInputDim("Ids"); - auto parents_dims = ctx->GetInputDim("Parents"); - PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true, - platform::errors::InvalidArgument( - "The shape of Input(Parents) must be same with the " - "shape of Input(Ids).")); - ctx->SetOutputDim("Out", ids_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -72,6 +61,8 @@ selected ids. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); -REGISTER_OP_CPU_KERNEL(gather_tree, ops::GatherTreeOpKernel, - ops::GatherTreeOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PD_INFER_META(phi::GatherTreeMeta)); + +REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, + GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu deleted file mode 100644 index 829682764a674db93728413b07133a41e72246b4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_tree_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_tree_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void GatherTree(const T *ids_data, const T *parents_data, - T *out_data, const int64_t max_length, - const int64_t batch_size, const int64_t beam_size) { - CUDA_KERNEL_LOOP(i, batch_size * beam_size) { - int batch = i / beam_size; - int beam = i % beam_size; - auto idx = - (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } -} - -template -class GatherTreeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - auto &ids_dims = ids->dims(); - int64_t max_length = ids_dims[0]; - int64_t batch_size = ids_dims[1]; - int64_t beam_size = ids_dims[2]; - - auto &dev_ctx = ctx.cuda_device_context(); - - const int block = 512; - int max_threads = - std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), - batch_size * beam_size); - const int grid = std::max(max_threads / block, 1); - GatherTree<<>>(ids_data, parents_data, out_data, max_length, - batch_size, beam_size); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(gather_tree, ops::GatherTreeOpCUDAKernel, - ops::GatherTreeOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h deleted file mode 100644 index e035a30e7954feaf06f197211b2a2ca266cfd473..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_tree_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherTreeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - auto &ids_dims = ids->dims(); - auto max_length = ids_dims[0]; - auto batch_size = ids_dims[1]; - auto beam_size = ids_dims[2]; - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - for (int batch = 0; batch < batch_size; batch++) { - for (int beam = 0; beam < beam_size; beam++) { - auto idx = (max_length - 1) * batch_size * beam_size + - batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 774ff0bd065995916562061784f5218336a9da93..66eecc13d04d1aa7d4532b69f7a2fbe8c62b8e6f 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,38 +15,19 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - - std::normal_distribution dist(mean, std); - auto shape = GetShape(context); - tensor->Resize(shape); - int64_t size = tensor->numel(); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { @@ -75,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -192,13 +141,20 @@ Used to initialize tensors with gaussian random generator. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel, - ops::CPUGaussianRandomKernel); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 21d827c79200c4a368ce7677b01b18ee4ddedb8d..00ce10bfe3bccb404bce9f681ee3c7030e0fa4c4 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -19,9 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/operators/index_impl.cu.h" + +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" DECLARE_bool(use_curand); @@ -44,7 +45,8 @@ struct GaussianGenerator { thrust::minstd_rand rng; rng.seed(seed_); using MT = typename details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); @@ -52,53 +54,6 @@ struct GaussianGenerator { } }; -template -class GPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - auto shape = GetShape(context); - tensor->Resize(shape); - - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::normal_distribution dist; - distribution::normal_transform trans(mean, std); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = - GaussianGenerator(mean, std, seed_offset.first, gen_offset); - IndexKernel>(dev_cxt, tensor, func); - } - } else { - auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); - } - } -}; - template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: @@ -126,21 +81,16 @@ class 
GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int64_t gen_offset = size * seed_offset.second; auto func = GaussianGenerator(mean, std, seed_offset.first, seed_offset.second); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } }; } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 6b778eee4345170a0288bc5741c6c1078615022f..ef836ab72f001a540e081d7e9975ca5ee28758be 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT in_arr = *reinterpret_cast(x + offset); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT x_in_arr = *reinterpret_cast(x + offset); ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); #pragma unroll @@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(y, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ @@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ is_aligned(x_g, kAlignment)) { \ diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 00ff7ad2166dcf99d7b60ec45adfe70b478dedf8..f3ac53138328dbfad12c6d530a6517f40c658677 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4e4ea8fbb833f84a5c06e182b1f2..f7c006dbcb1a9a23ec619c8d790df8a093530eee 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); - - auto src_index_dims = ctx->GetInputDim("Src_index"); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), 1, - platform::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = ctx->GetInputDim("Dst_index"); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), 1, - platform::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ( - src_index_dims[0], dst_index_dims[0], - platform::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "GraphSendRecv"); - ctx->SetOutputDim("Dst_count", {dims[0]}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendRecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker); 
+ ops::GraphSendRecvGradOpMaker, + GraphSendRecvInferShapeFunctor); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac38430d2d473eeca548b63e1a5c1fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { 
- *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t 
max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? 
grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = 
framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845bf6828ee53459e6d86bdebba484..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if 
(pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if 
(index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 8f3c6660f51c4de80e5a98370eae0381abe333a6..93e96694270a458844bbcabf78f2559975902c2f 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 72a90d17998d84f0d0d4e081543acae94756e635..b376334f1e93cc3be9e716d808525011edb29b94 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -29,6 +29,7 @@ namespace operators { using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 #define CHECK_CASE(i, flags, kernel_name, ...) 
\ if (i == flags) { \ @@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var, - const DataLayout data_layout) { + int group_size, T* mean, T* var) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + x_mean += val; x_var += val * val; } @@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } +template +__device__ __forceinline__ void ThreadReduce(const T* input, int size, + const int offset, AccT* mean, + AccT* var) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT temp = ins[i]; + *mean += temp; + *var += temp * temp; + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + x_mean /= size; + x_var /= size; + CudaAtomicAddWithWarp(&mean[i], x_mean); + CudaAtomicAddWithWarp(&var[i], x_var); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * size; + ThreadReduce(x, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[i] = static_cast(x_mean / size); + var[i] = static_cast(x_var / size); + } +} + template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, @@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); - if (cid == 0 && 
threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; int hid, wid; + int index = (bid * C + ccid) * imsize + imid; if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; + val = x[index]; } else { hid = imid / W; wid = imid % W; val = x[(bid * H + hid) * W * C + wid * C + ccid]; } val = (val - x_mean) * var_inv; - if (flags & kHasScale) val *= scale[gid * group_size + cid]; - if (flags & kHasBias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } if (data_layout == DataLayout::kNCHW) { - y[(bid * C + ccid) * imsize + imid] = val; + y[index] = val; } else { y[(bid * H + hid) * W * C + wid * C + ccid] = val; } @@ -182,16 +266,41 @@ class GroupNormKernel imsize *= x_dims[i]; } } + #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else int block_size = std::min(1024, imsize); #endif + dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, - temp_var_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW< + T, AccT, vec_size><<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data); + } int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 88530b5352d31df7fac6eb122867f275777e40f6..d7cf03ddd6189393d16281b434c4dd5b4984e923 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -15,9 +15,9 @@ limitations under the License. 
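Note (editorial, not part of the patch): the new vectorized NCHW reduction above accumulates the per-group sums of x and x*x and stores E[x] and E[x^2]; GroupNormForward then recovers the biased variance via Var(x) = E[x^2] - E[x]^2 before applying rsqrt. A minimal sketch of that identity, with a hypothetical helper name:

// Recover mean and biased variance from the two accumulated sums.
template <typename AccT>
void MeanVarFromSums(AccT sum_x, AccT sum_x2, AccT n, AccT* mean, AccT* var) {
  *mean = sum_x / n;                       // E[x]
  *var = sum_x2 / n - (*mean) * (*mean);   // E[x^2] - E[x]^2
}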
*/ #include "paddle/fluid/operators/gru_op.h" #include #include -#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" DECLARE_int32(paddle_num_threads); @@ -316,7 +316,7 @@ class GRUCPUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -326,7 +326,7 @@ class GRUCPUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -347,9 +347,9 @@ class GRUCPUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t seq_len = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); #ifdef PADDLE_WITH_MKLML @@ -396,9 +396,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); } - math::detail::forward_reset_output( - math::detail::forward::gru_resetOutput(), gru_value, frame_size, - cur_batch_size, active_gate); + phi::funcs::detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), gru_value, + frame_size, cur_batch_size, active_gate); if (gru_value.prev_out_value) { blas.GEMM_COMPUTE( @@ -408,9 +408,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 3); } - math::detail::forward_final_output( - math::detail::forward::gru_finalOutput(), gru_value, frame_size, - cur_batch_size, active_node, origin_mode); + phi::funcs::detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), gru_value, + frame_size, cur_batch_size, active_node, origin_mode); gru_value.prev_out_value = gru_value.output_value; } @@ -432,7 +432,7 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); @@ -441,7 +441,7 @@ class GRUCPUKernel : public framework::OpKernel { #ifdef PADDLE_WITH_MKLML } #endif - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 7d055240916f621d90a3496ee241d9348e88b71d..5be0acc15432c896872a70e0a87949faea496a42 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -65,7 +65,7 @@ class GRUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor 
to_batch; auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -75,7 +75,7 @@ class GRUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -96,9 +96,9 @@ class GRUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); @@ -111,13 +111,13 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 130b10c7390110770336099c3ac64966389441eb..852655034c8c277f7e7bf1fb562951c26223c101 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -63,7 +63,7 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); @@ -93,12 +93,12 @@ class GRUGradKernel : public framework::OpKernel { batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse); - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaGrad gru_grad; if (weight_grad) { gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); @@ -112,9 +112,9 @@ class GRUGradKernel : public framework::OpKernel { auto batch_starts = batch_hidden_grad.lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (int n = static_cast(num_batch) - 1; n >= 0; n--) { int bstart = static_cast(batch_starts[n]); @@ -145,13 +145,13 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } gru_value.output_value = nullptr; - math::GRUUnitGradFunctor::compute( + phi::funcs::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, active_gate, origin_mode); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, input_grad); } diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index f8f8f3fd789ad61a99bcc17bc073b6cfd099f639..524f2d6c9d719468876d8a586b6eea13f99a7b79 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, - 
PT_INFER_META(phi::GumbelSoftmaxInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PD_INFER_META(phi::GumbelSoftmaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, GumbelSoftmaxGradInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + PD_INFER_META(phi::GumbelSoftmaxGradInferMeta)); REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 3915ce5809c394738c58e80accccac531c268c23..3c9bbc753f29b1cf104a085d340ddc75cf2790f8 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -112,8 +112,8 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, - PT_INFER_META(phi::HuberLossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PD_INFER_META(phi::HuberLossInferMeta)); REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 33b68d68992dd819f74c2ae67153ecc6b050b16b..16968876ac96cac2fa1b009ea40b939f1e11a953 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 105d818e197434c4ed85126228e06d45bf06e498..e2efaa1759b008dd0055bb6e06917cbd4fc1932f 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, - PT_INFER_META(phi::IncrementInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PD_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker, diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad3784a598524273831bf875ed9213..8324a6215bca8145ba36dabb3d8108006a57e829 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f..bb26e2f445e7034b8f982594216eacfd3007a24f 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -19,11 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace paddle { @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = paddle::platform::GetVectorizedSize(out_data); + int vec_size = phi::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 2d97797cfec21ed50f0999fa13f8bb1ae9618b71..d17c6368c7537b93ceb6f1d75b6d73467bd207ac 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_sample_op.h" #include #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Inputs(Input) of FindByIndex should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Inputs(Index) of FindByIndex should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0], - platform::errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], index_dims[0])); - } - ctx->SetOutputDim("Out", index_dims); - auto type = ctx->GetInputsVarType("Index")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Index", /*->*/ "Out"); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PD_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker); + ops::IndexSampleGradMaker, + IndexSampleInferShapeFunctor); REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, ops::IndexSampleGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CPU_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu deleted file mode 100644 index e8acbfb8be990a422e5a16e8871d47f55af6620c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.cu +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define PREDEFINED_BLOCK_SIZE_X 512 -#define PREDEFINED_BLOCK_SIZE 1024 -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -namespace paddle { -namespace operators { - -namespace { -void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - auto max_grid_dim = ctx.template device_context() - .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; - grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; -} -} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void IndexSampleForward(const IndexT* index, const T* in_data, - T* out_data, size_t index_length, - size_t input_length, size_t batch_size) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; - } - } -} - -template -__global__ void IndexSampleGrad(const IndexT* index, T* in_grad, - const T* out_grad, size_t index_length, - size_t input_length, size_t batch_size, - bool same_data_in_row = true) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; - } - } - } -} - -template -class IndexSampleKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* output = ctx.Output("Out"); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - 
platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - const auto* in_data = input->data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); - - auto input_dim = input->dims(); - auto index_dim = index->dims(); - size_t batch_size = input_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - int block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } - } -}; - -template -class IndexSampleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - - const auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto stream = - ctx.template device_context().stream(); - auto input_num = input_grad->numel(); - auto input_dim = input_grad->dims(); - auto index_dim = index->dims(); - size_t batch_size = index_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - bool same_data_in_index_row = index_length == 1 ? 
false : true; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - auto block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CUDA_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h deleted file mode 100644 index 6cc8ff04c544554e805c605783c9bedf1b9fcb7b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.h +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -void IndexSampleInner(const framework::ExecutionContext &context, - const LoDTensor &input, const LoDTensor &index, - LoDTensor *output) { - auto input_dims = input.dims(); - auto index_dims = index.dims(); - - int batch_size = input_dims[0]; - auto value_length = input_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - - std::vector res(index_ids_num); - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - - int v_i = b * value_length + static_cast(index_vec[i]); - T v = input_vec[v_i]; - VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i - << " value = " << v; - res[i] = v; - } - - auto ddim = phi::make_ddim({batch_size, index_length}); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(res, context.device_context(), output); - output->Resize(ddim); -} - -template -class IndexSampleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *index_var = ctx.InputVar("Index"); - - auto &input_tensor = input_var->Get(); - auto &index_tensor = index_var->Get(); - - auto *out_var = ctx.OutputVar("Out"); - auto *out_tensor = out_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } - } -}; - -template -void IndexSampleGradInner(const framework::ExecutionContext &context, - const LoDTensor &out_grad, const LoDTensor &index, - LoDTensor *x_grad) { - std::vector out_grad_vec; - std::vector index_vec; - paddle::framework::TensorToVector(out_grad, context.device_context(), - &out_grad_vec); - paddle::framework::TensorToVector(index, context.device_context(), - 
&index_vec); - - auto index_dims = index.dims(); - auto x_grad_dims = x_grad->dims(); - - auto value_length = x_grad_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector x_grad_vec(x_grad->numel(), 0); - - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - int v_i = b * value_length + static_cast(index_vec[i]); - x_grad_vec[v_i] += out_grad_vec[i]; - } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad); - x_grad->Resize(x_grad_dims); -} - -template -class IndexSampleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *index_var = context.InputVar("Index"); - auto *x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto *out_grad_var = context.InputVar(framework::GradVarName("Out")); - - auto &index_tensor = index_var->Get(); - auto &out_grad_tensor = out_grad_var->Get(); - auto *x_grad_tensor = x_grad_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index f460d0622bccc2e71b1e147c0c9add688c3b11c4..38eb5b4514993412fa3a6c96ccc92e899c57b205 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_sample_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index e0779249c41adc5005bbaba6e19127d2ced3a9ec..7f5136969980b887bb7bbe013690898e66abeac1 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -17,6 +17,8 @@ #include #include #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { @@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { }; template -class InplaceABNKernel - : public paddle::operators::BatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -213,7 +214,33 @@ class InplaceABNKernel auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - BatchNormKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); auto cur_y = EigenVector::Flatten(*y); InplaceABNActivation functor; @@ -222,8 +249,7 @@ class InplaceABNKernel }; template -class InplaceABNGradKernel - : public paddle::operators::BatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Input("Y"); @@ -244,7 +270,52 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - BatchNormGradKernel::Compute(ctx); + // BatchNormGradKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = 
ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } }; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index be7a7bd71711e379ef4d98eb1f9ac5ee2caaace6..db8f8c72d13f8e46f6f9e332c5c2f5164b6d0836 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -15,14 +15,15 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { template class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel, - public paddle::operators::BatchNormKernel { + : public paddle::operators::SyncBatchNormKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,7 +37,33 @@ class InplaceABNKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormKernel::Compute(ctx); } else { - BatchNormKernel::Compute(ctx); + // BatchNormKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); } auto cur_y = EigenVector::Flatten(*y); @@ -49,8 +76,7 @@ class InplaceABNKernel // https://kevinzakka.github.io/2016/09/14/batch_normalization/ template class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel, - public paddle::operators::BatchNormGradKernel { + : public paddle::operators::SyncBatchNormGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -74,7 +100,50 @@ class InplaceABNGradKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormGradKernel::Compute(ctx); } else { - BatchNormGradKernel::Compute(ctx); + 
auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } } }; diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae02f9b87f0a0976543467aa0b7dd0..31c22915ec5d052eb11c613d476f6aea541d8c47 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc773925e998507db4690e39c15f985d0..c835bb3cf60bfbf71b585828c74ac45f6bc91f8b 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc deleted file mode 100644 index 3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/is_empty_op.cu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/is_empty_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b1213fccec0c4098048e85a6b24f8..cfa370ff9cb19dfb7d488b03cba52c115083cdc8 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0f590f96577295ffb12e4456d2e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index 4f30c58d375008abb3509989f90bcd9fec91fb38..f6f56f70f1a11971b31e679ef879f2d1d0a96085 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/kthvalue_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index b31c7a1cde0f18edb00435805ce4b2a089f9eb1a..412ae3c49b5f3cc9fc2422aa220af324e6d99b69 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { namespace operators { @@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec x[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); col += THREADS_PER_ROW; } U xf[LDGS * VecSize]; @@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } @@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( U *__restrict__ dgamma_temp_ptr, U *__restrict__ 
dbeta_temp_ptr, T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr, T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); col += THREADS_PER_ROW; } @@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, - &dout[it]); - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + &dout[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); if (isFusedDropoutResidualLn) { - platform::Load( + phi::Load( mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); } @@ -474,11 +471,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < LDGS; it++) { #pragma unroll for (int jt = 0; jt < VecSize; jt++) { - U x_tmp = x[it][jt]; + U x_tmp = static_cast(x[it][jt]); U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U dy_tmp = static_cast(gamma[it][jt]) * - static_cast(dout[it][jt]); // scale * dy - U dout_tmp = dout[it][jt]; // dy + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = static_cast(dout[it][jt]); // dy // used for get dx (row reduction) sum_loss1 += dy_tmp; // scale * dy, sum_1 @@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Store(x[it], - dx_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize); if (isFusedDropoutResidualLn) { - platform::Store( + phi::Store( dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); } col += THREADS_PER_ROW; @@ -641,7 +637,7 @@ template < __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { - using Vec = platform::AlignedVector; + using Vec = phi::AlignedVector; static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); const int tidx = threadIdx.x; @@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( for (int row = r; row < rows; row += ROWS_PER_CTA) { Vec dg; Vec db; - platform::Load(dg_part_ptr, &dg); - platform::Load(db_part_ptr, &db); + phi::Load(dg_part_ptr, &dg); + phi::Load(db_part_ptr, &db); dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d439b3220d96ecd1107d6c29850d3d5356a01e09..dfe73d3727132ae9b8f71e2a415ef5193f303493 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ops::LayerNormGradKernel, ops::LayerNormGradKernel); +#elif CUDNN_VERSION_MIN(8, 1, 0) 
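The layer_norm changes above only re-point the vectorized helpers from platform::AlignedVector to phi::AlignedVector; the underlying Load/Store trick is unchanged. A minimal host-side sketch of that pattern follows, assuming hypothetical names (VecType, Load, Store) rather than the phi definitions.

#include <cstddef>
#include <cstdio>

// Packing VecSize elements into an over-aligned aggregate lets the compiler
// (or nvcc) emit one wide load/store instead of VecSize scalar accesses.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VecType {
  T val[VecSize];
  T& operator[](int i) { return val[i]; }
  const T& operator[](int i) const { return val[i]; }
};

template <typename T, int VecSize>
void Load(const T* addr, VecType<T, VecSize>* vec) {
  *vec = *reinterpret_cast<const VecType<T, VecSize>*>(addr);
}

template <typename T, int VecSize>
void Store(const VecType<T, VecSize>& vec, T* addr) {
  *reinterpret_cast<VecType<T, VecSize>*>(addr) = vec;
}

int main() {
  alignas(16) float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  VecType<float, 4> v;
  Load<float, 4>(buf, &v);      // one 16-byte load of buf[0..3]
  Store<float, 4>(v, buf + 4);  // one 16-byte store into buf[4..7]
  std::printf("%g %g\n", buf[4], buf[7]);  // prints "0 3"
  return 0;
}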
+REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); #else REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index 0aaefc7ca75eb0f98e35200f0a1940aae07315b2..5e053445379118b37c9b0e0bdcb01adaec65b6c1 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -20,49 +23,6 @@ namespace operators { class LerpOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "lerp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "lerp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto w_dims = ctx->GetInputDim("Weight"); - framework::DDim out_dims; - out_dims = GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = GetOutputDims(out_dims, w_dims); - } - - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - private: - framework::DDim GetOutputDims(const framework::DDim& s_dims, - const framework::DDim& l_dims) const { - if (s_dims.size() > l_dims.size()) { - return GetOutputDims(l_dims, s_dims); - } - std::vector shapes = phi::vectorize(l_dims); - for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) { - int64_t s = s_dims[i]; - int64_t l = l_dims[j]; - if (s != l) { - if (l == 1) { - shapes[j] = s; - } else if (s != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of tensor a %s:%d must match shape of tensor b " - "%s:%d.", - s_dims.to_str(), i, l_dims.to_str(), j)); - } - } - } - return phi::make_ddim(shapes); - } }; class LerpOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,10 +85,12 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, + PD_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, paddle::operators::LerpOpGradMaker, - paddle::operators::LerpInplaceInferer); + paddle::operators::LerpInplaceInferer, LerpInferShapeFunctor); REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp); diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a750bdbbdc05ac4b7835205ebe66..378c7573d6129abc28bd53dd6f964e5c726cce34 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
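The lerp change above deletes LerpOp::GetOutputDims and relies on phi::LerpInferMeta instead. For reference, the deleted code implemented the usual right-aligned broadcast rule (trailing dimensions must match or one of them must be 1); a standalone sketch with a hypothetical BroadcastDims helper:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Right-aligned broadcasting, as in the removed GetOutputDims: walk the two
// shapes from the back, keep equal dims, expand a 1, otherwise fail.
std::vector<int64_t> BroadcastDims(std::vector<int64_t> a,
                                   std::vector<int64_t> b) {
  if (a.size() > b.size()) a.swap(b);  // make `b` the longer shape
  std::vector<int64_t> out = b;
  for (int i = static_cast<int>(a.size()) - 1,
           j = static_cast<int>(b.size()) - 1;
       i >= 0; --i, --j) {
    if (a[i] == b[j]) continue;
    if (b[j] == 1) {
      out[j] = a[i];
    } else if (a[i] != 1) {
      throw std::invalid_argument("shapes are not broadcast-compatible");
    }
  }
  return out;
}

int main() {
  // lerp(X, Y, Weight) computes Out = X + Weight * (Y - X) with all three
  // inputs broadcast to a common shape.
  auto xy = BroadcastDims({4, 1, 3}, {2, 3});  // -> {4, 2, 3}
  auto out = BroadcastDims(xy, {1});           // scalar-like weight
  return out == std::vector<int64_t>({4, 2, 3}) ? 0 : 1;
}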
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df0aa76872c56a3769f1186125bf5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09b433e784ecaf52da69e41fc3706..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index df4d0ebbccd5e3fb4dd6131fb5fbcaa9056bd9d6..883e3597d8a31138a6ff1e4cfcb05a165eafc4a6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
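The deleted linspace kernels above (CUDA and CPU) both fill the output from the two ends toward the middle so that the last element is exactly `stop` despite rounding, and accumulate the step in double regardless of the output type. A standalone sketch of that computation, using a hypothetical Linspace helper rather than the phi kernel that replaces them:

#include <cstdio>
#include <vector>

std::vector<float> Linspace(float start, float stop, int num) {
  std::vector<float> out(num);
  if (num == 1) {
    out[0] = start;  // special case, as in the removed LinspaceSpecialKernel
    return out;
  }
  // Step is computed in double for all element types.
  double step = static_cast<double>(stop - start) / (num - 1);
  for (int i = 0; i < num; ++i) {
    out[i] = (i < num / 2)
                 ? static_cast<float>(start + step * i)              // from the front
                 : static_cast<float>(stop - step * (num - 1 - i));  // from the back
  }
  return out;
}

int main() {
  auto v = Linspace(0.f, 1.f, 5);  // 0, 0.25, 0.5, 0.75, 1
  for (float x : v) std::printf("%g ", x);
  std::printf("\n");
  return 0;
}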
*/ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,17 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CPU_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h deleted file mode 100644 index e7985ab810b138da62390fae29eb4a6cf638c897..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_loss_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -class LogLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* loss_out = ctx.Output("Loss"); - - loss_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto loss = EigenVector::Flatten(*loss_out); - auto& place = *ctx.template device_context().eigen_device(); - - EigenLogLoss, T>::Eval( - place, loss, prediction, label, epsilon); - } -}; - -template -class LogLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - - auto dl = EigenVector::Flatten(*dloss); - auto& place = *ctx.template device_context().eigen_device(); - - if (dpred) { - dpred->mutable_data(ctx.GetPlace()); - auto dx = framework::EigenVector::Flatten(*dpred); - EigenLogLossGrad, T>::Eval( - place, dx, dl, prediction, label, epsilon); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 9775910bba5cf30096f395c20d9dff3b5b1e541f..f103a69707a214400bbe2734409df4d9de3902e8 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index b2e68e9870d3c4f240fe35a4cbec811aefbc13f1..aa5fdd86745d6932052347f3dc11b14e3d447d20 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -10,11 +10,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 034e67568b34cebdfeddb884345b21cd99afb34f..8770abdac838f63b0c9f3a95b1ac0283a80ecbf2 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -13,9 +13,9 @@ // limitations under the License. 
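The Eigen-based log_loss kernels deleted above evaluate the epsilon-smoothed log loss and its gradient with respect to the prediction; the scalar sketch below is for reference only and assumes the registered phi kernels compute the same quantities.

#include <cmath>
#include <cstdio>

//   loss  = -y * log(p + eps) - (1 - y) * log(1 - p + eps)
//   dL/dp = dloss * (-y / (p + eps) + (1 - y) / (1 - p + eps))
float LogLoss(float p, float y, float eps) {
  return -y * std::log(p + eps) - (1.f - y) * std::log(1.f - p + eps);
}

float LogLossGrad(float p, float y, float eps, float dloss) {
  return dloss * (-y / (p + eps) + (1.f - y) / (1.f - p + eps));
}

int main() {
  float p = 0.9f, y = 1.f, eps = 1e-4f;
  std::printf("loss=%f dpred=%f\n", LogLoss(p, y, eps),
              LogLossGrad(p, y, eps, 1.f));
  return 0;
}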
#include -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -468,16 +468,18 @@ class LogSoftmaxGradKernel } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( log_softmax_grad, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 62f9cd26c418399ac967e62a17d48f0c470b1ae7..4ec3072a96d445805f482060585a888a2a165413 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -31,7 +31,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -80,7 +80,7 @@ class LSTMKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmMetaValue will be updated later. 
@@ -121,11 +121,11 @@ class LSTMKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto blas = phi::funcs::GetBlas(device_ctx); @@ -166,13 +166,13 @@ class LSTMKernel : public framework::OpKernel { lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, hidden_out); @@ -241,7 +241,7 @@ class LSTMGradKernel : public framework::OpKernel { ") should be %d, but received %d in LSTM@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstm_value.check_ig = bias_data + 4 * frame_size; @@ -253,7 +253,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_value.check_og = nullptr; } - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaGrad lstm_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -270,7 +270,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -293,11 +293,11 @@ class LSTMGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto batch_starts = batch_gate->lod()[0]; @@ -338,7 +338,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.state_active_grad = nullptr; int cur_batch_size = bend - bstart; T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -369,7 +369,7 @@ class LSTMGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 96c074f1efb418a872b65b08affc7bdb0ed6a02f..5d24c0b70d3477224e89ca47924816e14abc5c56 
100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -18,12 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, dst, indexed_src); } @@ -81,15 +81,15 @@ template class LSTMPKernel : public framework::OpKernel { public: template - void ActCompute(const math::detail::ActivationType act_type, const Device& d, - X x, Y y, platform::Place place) const { - if (act_type == math::detail::ActivationType::kIdentity) { + void ActCompute(const phi::funcs::detail::ActivationType act_type, + const Device& d, X x, Y y, platform::Place place) const { + if (act_type == phi::funcs::detail::ActivationType::kIdentity) { y.device(d) = x; - } else if (act_type == math::detail::ActivationType::kSigmoid) { + } else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) { SigmoidFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kTanh) { + } else if (act_type == phi::funcs::detail::ActivationType::kTanh) { TanhFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kReLU) { + } else if (act_type == phi::funcs::detail::ActivationType::kReLU) { if (place == platform::CPUPlace()) ReluCPUFunctor()(d, x, y); else @@ -120,7 +120,7 @@ class LSTMPKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -137,7 +137,7 @@ class LSTMPKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmpMetaValue will be updated later. 
@@ -176,13 +176,13 @@ class LSTMPKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); auto blas = phi::funcs::GetBlas(device_ctx); @@ -222,13 +222,13 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.output_value = hidden_t.data(); lstmp_value.state_value = cell_t.data(); lstmp_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstmp_value.prev_state_value = lstmp_value.state_value; blas.MatMul(hidden_t, false, *proj_weight, false, static_cast(1.0), &proj_t, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto proj_t_dev = EigenMatrix::From(proj_t); ActCompute(cell_act, place, proj_t_dev, proj_t_dev, ctx.GetPlace()); } @@ -242,7 +242,7 @@ class LSTMPKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_proj.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_proj, proj_out); @@ -257,16 +257,16 @@ template class LSTMPGradKernel : public framework::OpKernel { public: template - void ActGradCompute(const math::detail::ActivationType act_type, + void ActGradCompute(const phi::funcs::detail::ActivationType act_type, const Device& d, X x, Y y, DX dx, DY dy) const { // x is dummy and won't be used even in Relu(use y instead) - if (act_type == math::detail::ActivationType::kIdentity) + if (act_type == phi::funcs::detail::ActivationType::kIdentity) dx.device(d) = dy; - else if (act_type == math::detail::ActivationType::kSigmoid) + else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kTanh) + else if (act_type == phi::funcs::detail::ActivationType::kTanh) TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kReLU) + else if (act_type == phi::funcs::detail::ActivationType::kReLU) ReluGradFunctor()(d, x, y, dy, dx); else PADDLE_THROW( @@ -340,7 +340,7 @@ class LSTMPGradKernel : public framework::OpKernel { "but received %d in LSTMP@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstmp_value.check_ig = bias_data + 4 * frame_size; @@ -352,7 +352,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.check_og = nullptr; } - math::LstmMetaGrad lstmp_grad; + phi::funcs::LstmMetaGrad lstmp_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -369,7 +369,7 @@ class LSTMPGradKernel : public 
framework::OpKernel { lstmp_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -393,13 +393,13 @@ class LSTMPGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); @@ -423,7 +423,7 @@ class LSTMPGradKernel : public framework::OpKernel { _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); } - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto cur_proj_dev = EigenMatrix::From(cur_proj); auto proj_g_dev = EigenMatrix::From(proj_g); ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, @@ -470,7 +470,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.output_value = nullptr; lstmp_grad.state_active_grad = nullptr; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -503,7 +503,7 @@ class LSTMPGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index a4c3d1c81fb3e32aed506381ea1e6fdbdc5066ba..3cbbc62e7bec92f329535e788f19d439c9341a0e 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rank_32, &wkopt, lwork, &rwkopt, &info); } - lwork = std::max(1, static_cast(phi::funcs::Real(wkopt))); + lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ac6566a87030d4c9cf613134cfe85c379fea5e20..31a98d9f630e1c01f3b886cbe91dd3882b384d05 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(detail) - if (WITH_ASCEND_CL) cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner) endif() @@ -7,6 +5,8 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif 
(WITH_MLU) +math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() math_library(concat_and_split DEPS concat_and_split_functor) endif() @@ -18,8 +18,7 @@ math_library(im2col) math_library(sample_prob) math_library(sampler DEPS generator) -math_library(gru_compute DEPS activation_functions math_function) -math_library(lstm_compute DEPS activation_functions) +# math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) math_library(pooling) @@ -29,7 +28,6 @@ else() math_library(selected_rows_functor DEPS selected_rows_utils math_function blas) endif() -math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) @@ -48,8 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) @@ -74,7 +70,6 @@ if(WITH_GPU AND (NOT WITH_ROCM)) endif() endif() -cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) if(WITH_TESTING AND TEST im2col_test) set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 46126ac59c892787d2f63956983404843e518ae7..c9308d27c0a3490d9c0094f45a1a9c2d894bbf57 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -18,6 +18,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -226,6 +229,90 @@ class SplitFunctor { }; #endif +#ifdef PADDLE_WITH_MLU +template +class ConcatFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const std::vector& input, int axis, + framework::Tensor* output) { + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto ins_size = input.size(); + + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = context.GetPlace(); + output->mutable_data(place); + + // build a CNNL tensor descriptor and collect the raw data pointer + // for each input tensor + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(input[i].data()); + } + // describe the output tensor + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + // launch the CNNL concat kernel on the MLU + MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(), + inputs.data(), output_desc.get(), GetBasePtr(output)); + } +}; + +template +class SplitFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { + if (input.numel() == 0) { + return; + } + + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto in_dims = input.dims(); + auto out_size = outputs->size(); + + std::vector outs_dims(out_size, in_dims);
+ for (size_t i = 0; i < out_size; ++i) { + outs_dims[i][axis] = ref_inputs[i]->dims()[axis]; + } + + // resize each output tensor and build its CNNL descriptor + std::vector vct_tensor; + std::vector output_descs; + std::vector desc_vector; + for (size_t i = 0; i < out_size; i++) { + (*outputs)[i]->Resize(outs_dims[i]); + (*outputs)[i]->mutable_data(context.GetPlace()); + output_descs.emplace_back( + MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, + ToCnnlDataType((*outputs)[i]->dtype()))); + desc_vector.push_back(output_descs.back().get()); + vct_tensor.push_back(GetBasePtr((*outputs)[i])); + } + // describe the input tensor + MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input.dtype())); + + // launch the CNNL split kernel on the MLU + MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(), + desc_vector.data(), vct_tensor.data()); + } +}; +#endif + + #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ template class SplitFunctor; @@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float) FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR) #endif +#ifdef PADDLE_WITH_MLU +#define DEFINE_MLU_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; +DEFINE_MLU_FUNCTOR(float) +DEFINE_MLU_FUNCTOR(platform::float16) +DEFINE_MLU_FUNCTOR(int64_t) +DEFINE_MLU_FUNCTOR(bool) +DEFINE_MLU_FUNCTOR(int) +DEFINE_MLU_FUNCTOR(int8_t) +DEFINE_MLU_FUNCTOR(int16_t) +DEFINE_MLU_FUNCTOR(uint8_t) +#endif } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h deleted file mode 100644 index e41f0aedf39ef582b4533b1eeb6ccda1e8ed7e49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -using DataLayout = framework::DataLayout; - -/* - * \brief Compute the depthwise convolution which include - * forward process and backpropagation process - */ -template -class DepthwiseConvFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvInputGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* input_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvFilterGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* filter_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9b6ebf73d9b09390edb16545d982010eb8692db0..1ade2190bb96e092ad546ace121192a87c8082ff 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,7 +123,7 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - phi::funcs::lapackEigh>( + phi::funcs::lapackEigh>( jobz, uplo, n, input_data, lda, value_data, work_data, lwork, rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int *lwork) const; diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h deleted file mode 100644 index 70cbfecefc8026f7603e095a53440daeffa29851..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/math/gru_compute.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct GRUMetaValue { - const T *gate_weight; - const T *state_weight; - const T *reset_bias; - T *gate_value; - T *reset_output_value; - T *output_value; - const T *prev_out_value; -}; - -template -struct GRUMetaGrad { - T *gate_weight_grad; - T *state_weight_grad; - T *gate_grad; - T *reset_output_grad; - T *output_grad; - T *prev_out_grad; - T *bias_hh_grad; -}; - -template -struct GRUUnitFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitGradFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -template -struct GRUUnitGradFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 38692a646111ec468de3fae6df619b33d9b9c8d5..9994ccc10cb13b2f692b18f16182f6bcdad7efa5 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = phi::funcs::Real; + using RealT = phi::dtype::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc deleted file mode 100644 index aa4fe65a5201c2db5684ac9407a869834f0eb757..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/lstm_compute.h" - -#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, detail::backward::lstm(), value, - grad, frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu deleted file mode 100644 index 4342cb7b79928eb19901a1efa084a3d1d1fbda43..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/lstm_compute.cu +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" -#include "paddle/fluid/operators/math/lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68df814015a2c308ed74b755f6bc635..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a94a567119bb8f37378af957be541..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. - memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7afb81d626c99b078cbc215c0195f..28ec3a871022f4b9ec4dce9d9310fd630f10e473 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c73f96d4f6428ba890c821a61292c..1d0478db5ef4a80d955d1166ffa21ff39f6bd184 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2fec306f6091712c66d55d1e71216e..1f4964f7715426d2eab6168ae009ffbd40e1ff0a 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index fcd5c06a6f310f8a23608a77f2d6b9098e99b33a..5ac39953462b5078aa663a7f39f5eb95c96bae7a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/operators/mkldnn/axpy_handler.h" @@ -502,32 +503,29 @@ struct MergeAdd { out.mutable_value()->mutable_data( phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; } - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* y_data = out.mutable_value()->data(); + auto* x_data = input.value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add(context.x_context(), &input_data[i * input_width], - &out_data[out_i * input_width], - &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } void operator()(const platform::XPUDeviceContext& context, @@ -582,15 +580,7 @@ struct MergeAdd { {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - - float* out_data = reinterpret_cast(out.mutable_value()->data()); + float* y_data = reinterpret_cast(out.mutable_value()->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -603,17 +593,22 @@ struct MergeAdd { } auto& input_rows = input->rows(); + auto* x_data = input->value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add( - context.x_context(), input->value().data() + i * input_width, - &out_data[out_i * input_width], &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + 
input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 8563d8b05b186c025ecc4c970a400765adeb0c5d..a4678550cf7bd0d4aa2759d4887dddabed5f9ba4 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -445,6 +446,7 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; template struct MergeAdd>; template struct MergeAdd>; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index fd879e9e6ffe72a2175acc2db98727f5ff39fbbb..83b124902ebb74e65af0a25e432ff6b488e5cee1 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP @@ -131,6 +135,10 @@ template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -139,9 +147,13 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -149,6 +161,7 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d51d638e0c19f43f9b0a91adbac15dffcdf14588..9833b4447ec45376e04ad520315e88568f7991d8 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -156,6 +156,65 @@ class SoftmaxEigen { } }; +template +class SoftmaxEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes 
one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + softmax.device(*context.eigen_device()) = softmax.exp(); + softmax.device(*context.eigen_device()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); + } +}; + template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, @@ -289,6 +348,38 @@ class SoftmaxGradEigen { } }; +template +class SoftmaxGradEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis); + logits_grad.device(*context.eigen_device()) = + (softmax_grad - dot) * softmax; + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 42bf1f471deb5238fdb34dcd9284972930305f58..bc5a589ed6fb137c5013253a65971dcf80d4ac72 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace platform { class CPUDeviceContext; @@ -141,6 +143,116 @@ class Vol2ColFunctor { } }; +template +class Vol2ColFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* col, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + // changed + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + c_in; + } + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + /* * vol = [input_channels,input_depth, input_height, input_width] * col = @@ -258,10 +370,125 @@ class Col2VolFunctor { } }; +template +class Col2VolFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* vol, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? 
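// A small standalone helper mirroring the dimension checks above: for each
// spatial axis of vol2col, the expected output extent follows the usual
// convolution output-size formula. ConvOutSize is a hypothetical free
// function, shown only to make the PADDLE_ENFORCE_EQ relations explicit.
#include <cassert>

inline int ConvOutSize(int in, int pad_before, int pad_after, int dilation,
                       int kernel, int stride) {
  // Effective kernel extent under dilation is dilation * (kernel - 1) + 1.
  return (in + pad_before + pad_after - (dilation * (kernel - 1) + 1)) /
             stride +
         1;
}

inline void ConvOutSizeExample() {
  // depth 8, symmetric padding 1, dilation 1, kernel 3, stride 2 -> 4,
  // i.e. the value the checks above compare against col->dims()[4..6].
  assert(ConvOutSize(8, 1, 1, 1, 3, 2) == 4);
}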
vol->dims()[3] : vol->dims()[2]); + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = + ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + cIm; + } + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + template class Vol2ColFunctor; template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; + template class Col2VolFunctor; template class Col2VolFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0c84c4751e78e6bd02c4a988a7d3558962a0de5 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -0,0 +1,337 @@ +/* Copyright 
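// A standalone sketch of the index bookkeeping shared by the Vol2ColFunctor
// and Col2VolFunctor specializations above: each column channel c encodes
// (c_in, d_off, h_off, w_off), and the volume offset depends on whether the
// layout is channels-first (NCDHW) or channels-last (NDHWC). Vol2Col copies
// vol -> col, zero-filling padded positions; Col2Vol accumulates col -> vol.
// Simplified free functions, for illustration only.
struct ColChannel {
  int c_in, d_off, h_off, w_off;
};

inline ColChannel DecodeColChannel(int c, int filter_d, int filter_h,
                                   int filter_w) {
  ColChannel r;
  r.w_off = c % filter_w;
  r.h_off = (c / filter_w) % filter_h;
  r.d_off = (c / filter_w / filter_h) % filter_d;
  r.c_in = c / filter_w / filter_h / filter_d;
  return r;
}

inline int VolIndex(bool channels_first, int c_in, int d, int h, int w,
                    int in_c, int in_d, int in_h, int in_w) {
  return channels_first
             ? ((c_in * in_d + d) * in_h + h) * in_w + w
             : ((d * in_h + h) * in_w + w) * in_c + c_in;
}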
(c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static void Mul(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out), ToCnnlDataType(), alpha); +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." + "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Matmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + if (!Out->initialized()) { + Out->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." 
+ "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::BatchMatmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const std::vector& dims, + const std::vector& bcast_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = bcast_dims.size(); + int64_t diff = bcast_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (bcast_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + std::vector reduce_dims(axes.begin(), axes.end()); + MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD, + ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr, + in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/, nullptr, + nullptr, out_desc.get(), GetBasePtr(out)); +} + +template +class MatMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + // Case 1: [K] x [K] = [1] + // Equal: [1, K] x [K, 1] = [1, 1] => [1] + const bool all_one_dim = (x_ndim == 1 && y_ndim == 1); + if (all_one_dim) { + Out->Resize({1, 1}); + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + x_temp.Resize(phi::make_ddim(x_dims)); + x_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < y_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.insert(temp_out_dims.end() - 1, 1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + if (y_ndim == 1) { + y_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + y_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < x_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.push_back(1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
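// A standalone sketch of the axis selection inside the ReduceDims helper
// above: when a gradient was computed in broadcast shape, every leading axis
// the original operand lacked, plus every axis that was expanded from 1,
// must be sum-reduced back. Plain std::vector version, illustration only.
#include <cstdint>
#include <vector>

inline std::vector<int64_t> AxesToReduce(const std::vector<int64_t>& dims,
                                         const std::vector<int64_t>& bcast) {
  std::vector<int64_t> axes;
  const int64_t diff = static_cast<int64_t>(bcast.size() - dims.size());
  for (int64_t i = 0; i < static_cast<int64_t>(bcast.size()); ++i) {
    if (i < diff || bcast[i] > dims[i - diff]) axes.push_back(i);
  }
  return axes;
}
// Example: dims = {3, 1, 5}, bcast = {2, 3, 4, 5} -> reduce axes {0, 2},
// i.e. the added batch axis and the axis that was broadcast from 1 to 4.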
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + if (x_ndim == 2 && y_ndim == 2) { + // Case 2: [M, K] x [K, N] = [M, N] + MatMul2D(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } else { + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + MatMulND(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } + + if (phi::vectorize(Out->dims()) != out_dims) { + Out->Resize(phi::make_ddim(out_dims)); + } + } +}; + +template +class MatMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); + + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + if (dX) { + Mul(ctx, *dOut, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, *dOut, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(phi::make_ddim(x_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(phi::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha); + } else { + MatMul2D(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha); + } + dX->Resize(X->dims()); + } + if (dY) { + dY->Resize(phi::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha); + } else { + MatMul2D(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha); + } + dY->Resize(Y->dims()); + } + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_bcast_dims(out_ndim, 1); + std::vector y_bcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); + + if (dX) { + Tensor dx_temp(X->type()); + if (x_dims != x_bcast_dims) { + dx_temp.Resize(phi::make_ddim(x_bcast_dims)); + } else { + dX->mutable_data(ctx.GetPlace()); + dx_temp.ShareDataWith(*dX); + } + + if (transpose_x) { + MatMulND(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha); + } else { + MatMulND(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, + alpha); + } + + if (x_dims != x_bcast_dims) { + ReduceDims(ctx, x_dims, 
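// A standalone sketch of the 2-D gradient rule applied above for the plain
// (non-transposed) case C = A * B: dA = dC * B^T and dB = A^T * dC. Naive
// row-major loops over std::vector buffers, illustration only; the kernel
// above additionally selects the transpose_X / transpose_Y variants and, for
// batched inputs, sum-reduces the broadcast result back with ReduceDims.
#include <vector>

using Mat = std::vector<float>;  // row-major buffer

inline void MatMulGradNoTrans(const Mat& A, const Mat& B, const Mat& dC,
                              int M, int K, int N, Mat* dA, Mat* dB) {
  dA->assign(M * K, 0.f);
  dB->assign(K * N, 0.f);
  for (int m = 0; m < M; ++m) {
    for (int k = 0; k < K; ++k) {
      for (int n = 0; n < N; ++n) {
        (*dA)[m * K + k] += dC[m * N + n] * B[k * N + n];  // dC * B^T
        (*dB)[k * N + n] += A[m * K + k] * dC[m * N + n];  // A^T * dC
      }
    }
  }
}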
x_bcast_dims, dx_temp, dX); + } + } + + if (dY) { + Tensor dy_temp(Y->type()); + if (y_dims != y_bcast_dims) { + dy_temp.Resize(phi::make_ddim(y_bcast_dims)); + } else { + dY->mutable_data(ctx.GetPlace()); + dy_temp.ShareDataWith(*dY); + } + + if (transpose_y) { + MatMulND(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha); + } else { + MatMulND(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, + alpha); + } + + if (y_dims != y_bcast_dims) { + ReduceDims(ctx, y_dims, y_bcast_dims, dy_temp, dY); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(matmul, ops::MatMulMLUKernel, + ops::MatMulMLUKernel); +REGISTER_OP_MLU_KERNEL(matmul_grad, ops::MatMulGradMLUKernel, + ops::MatMulGradMLUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 788dbb2204109dd4f215730e4234e3fec8aef702..01fa01e3c6ed04c151f709dd5fbebe387c32bde3 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker, ops::MatMulV2GradOpMaker); -DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(phi::GeneralBinaryGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 1524a50f1ac6d6afa67722bc5d1c16a581395bb2..87df75ac465042a0f7894abecb4be4c213e5d807 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( ColumnMatrixFromVector(y_dims), 0, trans_y); - if (x_dims.size() == 3 && y_dims.size() <= 2) { + if (x_dims.size() >= 3 && y_dims.size() <= 2) { // if transpose_X is true, the transpose cost much time if (!trans_x) { mat_dim_a.height_ *= mat_dim_a.batch_size_; diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f3646163925be95b27b9fec25207f8c..cdf204628b638f877c92e35a8941487aa39b5427 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
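// A standalone sketch of the reshaping trick behind the matmul_v2 XPU change
// above (x_dims.size() >= 3 && y_dims.size() <= 2): when Y is a plain [K, N]
// matrix and X is not transposed, a batched [B, M, K] x [K, N] product equals
// a single [(B*M), K] x [K, N] product, so the batch axis can be folded into
// the row dimension. MatDesc is a simplified stand-in for the real matrix
// descriptor type; illustration only.
struct MatDesc {
  int batch_size;
  int height;  // rows of each matrix in the batch
  int width;   // columns of each matrix in the batch
};

inline MatDesc FoldBatchIntoRows(MatDesc a) {
  a.height *= a.batch_size;  // [B, M, K] viewed as [(B*M), K]
  a.batch_size = 0;          // treat the operand as unbatched from here on
  return a;
}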
-#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace operators { @@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerGradOpMaker); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index d2c67d80b4f5a562d47e56173ecf1ea2f99bff56..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. 
the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
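// A standalone scalar sketch of the binary-exponentiation loop used above to
// evaluate newX^{n} in O(log n) multiplies: square a running power `z` each
// iteration and multiply it into the result whenever the corresponding bit
// of n is set. The kernel above applies the same scheme with matrix products.
#include <cstdint>

inline double PowBySquaring(double x, uint32_t n) {
  double result = 1.0;
  double z = x;  // z holds x^(2^i) at iteration i
  while (n > 0) {
    if (n & 0x1) result *= z;  // bit set: fold this power into the result
    z *= z;
    n >>= 1;
  }
  return result;
}
// PowBySquaring(3.0, 5) == 243.0, using two result multiplies and three
// squarings instead of four sequential multiplies.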
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
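// A standalone scalar sketch of the chain rule the gradient function above
// implements for Out = X^n (n > 0):
//   dX = sum_{i=0}^{n-1} (X^T)^i * dOut * (X^T)^{n-1-i}.
// For 1x1 "matrices" this collapses to the familiar n * x^(n-1) * dOut,
// which the loop below reproduces. Illustration only.
#include <cmath>

inline double MatrixPowerGradScalar(double x, double dout, int n) {
  double dx = 0.0;
  for (int i = 0; i < n; ++i) {
    dx += std::pow(x, i) * dout * std::pow(x, n - 1 - i);
  }
  return dx;  // equals n * pow(x, n - 1) * dout
}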
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 65599259e2237387ad0dd85b5a9772733e3d7a1a..1f04875c2203b2af80aa3cb81aaf95fbb0a6fe6c 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/svd_helper.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -224,15 +225,15 @@ class MatrixRankCPUKernel : public framework::OpKernel { int axis = -1; if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CPUDeviceContext, T, int>( context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); + phi::funcs::GreaterThanFunctor(), &compare_result); } else { - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CPUDeviceContext, T, int>( context, &eigenvalue_tensor, &tol_tensor, axis, - LessThanFunctor(), &compare_result); + phi::funcs::LessThanFunctor(), &compare_result); } auto dito_int = math::DeviceIndependenceTensorOperations { compare_result.mutable_data(detail::NewAxisDim(dim_out, k), context.GetPlace()); int axis = -1; - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CUDADeviceContext, T, int64_t>( context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); + phi::funcs::GreaterThanFunctor(), &compare_result); auto dito_int = math::DeviceIndependenceTensorOperations(context); diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/fluid/operators/matrix_rank_op.h index 80774aa916920dd5c828498f4345bd85ea4f33f8..93545fd31037ada823d35af5b5bad809ebf3d773 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/fluid/operators/matrix_rank_op.h @@ -15,7 +15,6 @@ #pragma once #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/controlflow/compare_op.h" #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777def2fafca648ad80bc57bef8df316..e55369e0691ee5e36da76c53c6dd5d13288231f4 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc deleted file mode 100644 index be1e81bb869a3a5144b72ef54af22f75b2146bc5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/maxout_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/maxout_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CUDA_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943ed5ee1ebcd08b5bcd93467496cb9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a46b06bd10a07a5d5d95d8825bdc6..32ef052119883944abc1876f8bf3a8c028ddc57a 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
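// A standalone sketch of the maxout operation whose CPU/GPU kernels are
// removed from this file: with `groups` g, each output channel is the
// maximum of g input channels, assuming the usual convention that the g
// channels of a group are consecutive. Simplified single-spatial-position
// version over std::vector, illustration only.
#include <algorithm>
#include <vector>

inline std::vector<float> MaxOut1D(const std::vector<float>& in, int groups) {
  const int out_channels = static_cast<int>(in.size()) / groups;
  std::vector<float> out(out_channels);
  for (int o = 0; o < out_channels; ++o) {
    float best = in[o * groups];
    for (int g = 1; g < groups; ++g) {
      best = std::max(best, in[o * groups + g]);
    }
    out[o] = best;
  }
  return out;
}
// The backward pass routes the upstream gradient only to the input element
// that attained the maximum in each group, which is what MaxOutGradFunctor
// does using the saved forward output.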
*/ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::NotFound("Input (Out) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, - platform::errors::NotFound( - "Input (Indices) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true, - platform::errors::NotFound( - "Input (Label) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true, - platform::errors::NotFound( - "Output (Accuracy) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true, - platform::errors::NotFound( - "Output (Correct) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true, - platform::errors::NotFound( - "Output (Total) of AccuracyOp is not found.")); - - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy", - "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy"); - - auto inference_dim = ctx->GetInputDim("Out"); - auto label_dim = ctx->GetInputDim("Label"); - // Assume indices has same shape as inference, because - // it's the output of topk. - - PADDLE_ENFORCE_EQ( - label_dim.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), label_dim)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(label_dim[1], 1, - platform::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], label_dim[0], - platform::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, label_dim, inference_dim[0], label_dim[0])); - } - - ctx->SetOutputDim("Accuracy", {1}); - ctx->SetOutputDim("Correct", {1}); - ctx->SetOutputDim("Total", {1}); - ctx->ShareLoD("Out", /*->*/ "Accuracy"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -123,13 +62,13 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. 
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor, + PD_INFER_META(phi::AccuracyInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); + paddle::framework::EmptyGradOpMaker, + AccuracyInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37e2efedad60a982bf19b09cac736..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? 
- const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. -// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67b9fd01aa9ae45a25d90963fef13..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c94a52e1fa14b04c00b595071f312..1ce02ff4525c9692f88ed42b79ff336cc0113c41 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 63bccc2e6e065a639c86a647894d2a0c124f0e54..9f2ca4165f33a28902bfe20207b12bad2af49fad 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -12,8 +12,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78df99adc3b3663f2fcbb3943373982e..3cc1be4de8a82ff263824ab4852178f735596d45 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. 
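// A standalone sketch of the accuracy metric whose CPU/CUDA kernels are
// deleted above: `indices` holds the top-k predicted class ids per sample,
// and a sample counts as correct if any of them matches its label. Plain
// std::vector version, illustration only.
#include <cstdint>
#include <vector>

inline float TopKAccuracy(const std::vector<int64_t>& indices,  // [N * K]
                          const std::vector<int64_t>& labels,   // [N]
                          int64_t k) {
  if (labels.empty()) return 0.f;
  int64_t correct = 0;
  for (size_t i = 0; i < labels.size(); ++i) {
    for (int64_t j = 0; j < k; ++j) {
      if (indices[i * k + j] == labels[i]) {
        ++correct;
        break;
      }
    }
  }
  return static_cast<float>(correct) / static_cast<float>(labels.size());
}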
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 2a3a0fa5d1fe50c93686c76571d812cab18c1d38..f3ed98c3f4d1e47a8b7dff81a998c7574859baa2 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/auc_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,5 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); -REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu deleted file mode 100644 index 1cb7eba8775e814b1150929de4a341c466ee4583..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] -= pos[cur_step_begin + i]; - neg[sum_step_begin + i] -= neg[cur_step_begin + i]; - pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; - } -} - -__global__ void UpdateSumDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] += pos[cur_step_begin + i]; - neg[sum_step_begin + i] += neg[cur_step_begin + i]; - } -} - -template -__global__ void AddDataKernel(const int64_t *label_data, const T *pred_data, - const int inference_width, - const int num_thresholds, int64_t *pos, - int64_t *neg, const int numel, - const int slide_steps) { - int cur_step_begin = 0; - if (slide_steps > 0) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % - slide_steps; - cur_step_begin = cur_step_index * (1 + num_thresholds); - } - CUDA_KERNEL_LOOP(i, numel) { - auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1."); - PADDLE_ENFORCE(predict_data >= 0, - "The predict data must gather or equal 0."); - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); - } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); - } - } -} -__global__ void CalcAucKernel(int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, double *auc, - bool need_add_batch_num) { - *auc = 0.0f; - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - if (need_add_batch_num) { - stat_pos[num_thresholds + 1] += 1; - stat_neg[num_thresholds + 1] += 1; - } -} - -template -class AucCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
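// A standalone CPU sketch of the sliding-window bookkeeping performed by the
// deleted ClearObsoleteDataKernel / AddDataKernel / UpdateSumDataKernel
// above: the stat buffer holds `slide_steps` per-step histograms, one
// running-sum histogram, and a trailing step counter, laid out back to back.
// Simplified sequential model of the same update, illustration only.
#include <cstdint>
#include <vector>

inline void SlideWindowUpdate(std::vector<int64_t>* stat,
                              const std::vector<int64_t>& batch_hist,
                              int bucket_length, int slide_steps) {
  int64_t& step_counter = (*stat)[(slide_steps + 1) * bucket_length];
  const int cur = static_cast<int>(step_counter % slide_steps);
  const int cur_begin = cur * bucket_length;
  const int sum_begin = slide_steps * bucket_length;
  for (int i = 0; i < bucket_length; ++i) {
    // Drop the step that is about to be overwritten from the running sum.
    (*stat)[sum_begin + i] -= (*stat)[cur_begin + i];
    // Overwrite it with the current batch and fold it back into the sum.
    (*stat)[cur_begin + i] = batch_hist[i];
    (*stat)[sum_begin + i] += (*stat)[cur_begin + i];
  }
  ++step_counter;  // advance the ring-buffer position
}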
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); -#ifdef PADDLE_WITH_CUDA - if (stat_pos_in_tensor != stat_pos) { - cudaMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - cudaMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } -#else - if (stat_pos_in_tensor != stat_pos) { - hipMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - hipMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } -#endif - - statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - int sum_offset = slide_steps * (num_thresholds + 1); - auto stream = - ctx.template device_context().stream(); - CalcAucKernel<<<1, 1, 0, stream>>>( - origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value, slide_steps > 0); - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - auto stream = - ctx.template device_context().stream(); - if (slide_steps == 0) { - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - - ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(auc, - ops::AucCUDAKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h deleted file mode 100644 index 10403472c69b57723bc714703c115f07d8640f7e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/auc_op.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AucKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
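For reference, the reduction that both CalcAucKernel above and the CPU calcAuc below implement: walk the histograms from the highest threshold down, accumulate positive and negative counts, sum trapezoids under the resulting (false positive, true positive) curve, and normalize by totPos * totNeg. A standalone host version of the same computation (illustrative name, same bucket layout assumed):

#include <cstdint>

static double BucketedAuc(const int64_t* stat_pos, const int64_t* stat_neg,
                          int num_thresholds) {
  double auc = 0.0, tot_pos = 0.0, tot_neg = 0.0;
  for (int idx = num_thresholds; idx >= 0; --idx) {
    const double prev_pos = tot_pos, prev_neg = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    // Trapezoid rule: width along the negative axis times the average height
    // along the positive axis.
    auc += (tot_neg - prev_neg) * (tot_pos + prev_pos) / 2.0;
  }
  if (tot_pos > 0.0 && tot_neg > 0.0) {
    auc = auc / tot_pos / tot_neg;
  }
  return auc;
}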
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - // Just for pass UT, since UT's input & output connot be set same var - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); - if (stat_pos_in_tensor != stat_pos) { - memcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - if (stat_neg_in_tensor != stat_neg) { - memcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - statAuc(label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - - int sum_offset = slide_steps * (num_thresholds + 1); - calcAuc(origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value); - if (slide_steps) { - origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; - origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; - } - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - if (slide_steps == 0) { - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[binIdx] += 1; - } - } - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] -= - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] -= - origin_stat_neg[cur_step_begin + i]; - } - - std::memset(origin_stat_pos + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - std::memset(origin_stat_neg + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if 
predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[cur_step_begin + binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[cur_step_begin + binIdx] += 1; - } - } - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] += - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] += - origin_stat_neg[cur_step_begin + i]; - } - } - - inline static void calcAuc(const int64_t *stat_pos, const int64_t *stat_neg, - int num_thresholds, double *auc) { - *auc = 0.0f; - - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220ab04087cd02abd76f6c3598425573c..812c55cdd5055186d7fd83a2057d88256f3b34a3 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -150,4 +150,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7b1179e203bc7807dd7818aa591a..a3b764b0e1c46ab91b989ed7f7b0b5df101f7654 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0..23428dd403e9b1ef62007c7b9193ed3b8482cab3 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,11 +29,11 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { @@ -55,7 +55,7 @@ class CacheTester { onednn_dev_ctx_->ResetBlobMap(nullptr); } - bool Analyze(unsigned short int num_entries) { + bool Analyze(uint16_t num_entries) { // Number of created objects in cache should be as expected (num_entries) return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries; } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792c429fcf45a367d3f06bf9add5d2..e9dadd5ec937cd11c84777a582cc1f7ac9fc3c33 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 52e2caaeb6ee129b6971d29dac41465b0373d5e3..9d0062e31388413fd4a441687631faebe8846c6e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -24,14 +24,17 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(transpose); +USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git 
a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index f88286288317bd8e7c09cbd23ecccfce5df98e7d..6e3bd5e43c9c1d7e5c8a5dd4ba37afcfd7147e20 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -21,9 +21,8 @@ limitations under the License. */ namespace fw = paddle::framework; namespace plat = paddle::platform; -namespace math = paddle::operators::math; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9de03582cbbf53e843e5f4531a6da6c1c2a87dd5..1fdaa153e3c27ed1a83696bf03d68dbfd2b93ae9 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + /* static */ void MLUCnnl::Div( const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_descs, output_ptrs)); } +/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + /* static */ void MLUCnnl::GatherFunctor( const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2cbecba9fa081970221242555b6b805ff9acae83..b55b10686e92e2b1b5b3a7390289f8329ac04a04 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -403,6 +403,11 @@ class MLUCnnl { const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output); + static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + static void Cast(const ExecutionContext& 
ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -566,6 +571,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); + static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + static void Scale(const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, @@ -1157,19 +1168,22 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const Tensor* transformed_input, Tensor* transformed_output, bool need_reshape_or_alloc) { - auto in_dims_vec = phi::vectorize(transformed_input->dims()); + const int dim_size = perm.size(); if (need_reshape_or_alloc) { + std::vector output_shape; + auto input_dims = transformed_input->dims(); + for (int i = 0; i < dim_size; ++i) { + output_shape.push_back(input_dims[perm[i]]); + } transformed_output->mutable_data( - {in_dims_vec[perm[0]], in_dims_vec[perm[1]], in_dims_vec[perm[2]], - in_dims_vec[perm[3]]}, - ctx.GetPlace()); + framework::DDim(output_shape.data(), dim_size), ctx.GetPlace()); } MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); - MLUCnnl::Transpose(ctx, perm, in_dims_vec.size(), trans_in_desc.get(), + MLUCnnl::Transpose(ctx, perm, dim_size, trans_in_desc.get(), GetBasePtr(transformed_input), trans_out_desc.get(), GetBasePtr(transformed_output)); } diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu index afb949d3374c62f561e910ea77e516bdb4004ac0..2bacda8afb0eb340c4c8d4068f3013e2adbc7f91 100644 --- a/paddle/fluid/operators/mode_op.cu +++ b/paddle/fluid/operators/mode_op.cu @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index fe4609b3ad91e703fc28a997d5505d4cffa001a8..b309e1b87ef9033bd4302cdad4ea60a64cbf02eb 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape( return out_dim; } -template -inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& matrix_a, - const framework::Tensor& matrix_b, - const framework::DDim& a_dim, - const framework::DDim& b_dim) { - auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - - framework::Tensor matrix_c; - framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.Resize(c_dim); - matrix_c.mutable_data(place); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: 
the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * save_result: set true by backward - * results: save the intermediate result during backward - */ -template -inline framework::Tensor MatChainMul( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, const uint64_t j, - const bool save_result, std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(ctx, ins, ins_dims, order, i, - order[i * ins.size() + j], - save_result, results); - framework::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(ctx, ins, ins_dims, order, - order[i * ins.size() + j] + 1, j, - save_result, results); - framework::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - // m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline framework::Tensor MultiDotMatChainOrder( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul(ctx, ins, ins_dims, order, 0, - ins.size() - 1, save_result, results); -} - -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; -/** - * 1. there are only 2 matrices: direct matrix multiplication A*B - * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), - * choose the least cost order for calculation - * 3. 
more than 3 matrices: call MultiDotMatChainOrder - */ -template -class MultiDotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto blas = phi::funcs::GetBlas(ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ma, Nb}); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ka, Nc}); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); - } - } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - } -}; - class MultiDotOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel { } }; -template -class MultiDotGradKernel : public framework::OpKernel { - public: - /** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ - void CalcGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, const framework::Tensor& A, - const framework::Tensor& B, const framework::DDim& dout_dim, - const framework::DDim& a_dim, const framework::DDim& b_dim, - framework::Tensor* dA, framework::Tensor* dB) const { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); - } - - /** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after 
reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * results: the intermediate result of farward - */ - void MatChainMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, - std::vector* dx, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, - const uint64_t j, - const std::vector& results) const { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - framework::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - framework::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - framework::Tensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dA.mutable_data(ctx.GetPlace()); - dB.mutable_data(ctx.GetPlace()); - - CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, - results); - MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, - results); - } - - void MultiDotGradMatChainOrder( - const framework::ExecutionContext& ctx, const framework::Tensor& dout, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) const { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, - &results); - MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, - results); - } - - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - auto blas = phi::funcs::GetBlas(ctx); - auto place = ctx.GetPlace(); - - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dx[i]->mutable_data(place); - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - framework::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { - dout_dim = phi::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], - dx[0], dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - 
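// Editorial sketch, not part of the original patch: this n == 3 branch (and
// the matching one in the removed forward kernel) picks the cheaper
// association for A(Ma x Ka) * B(Ka x Nb) * C(Nb x Nc):
//   (A*B)*C costs Ma*Ka*Nb + Ma*Nb*Nc = Ma*Nb*(Ka + Nc)  -> cost1
//   A*(B*C) costs Ka*Nb*Nc + Ma*Ka*Nc = Ka*Nc*(Nb + Ma)  -> cost2
// For example, Ma=10, Ka=100, Nb=5, Nc=50 gives cost1 = 7,500 and
// cost2 = 75,000, so (A*B)*C is chosen. For n > 3 the removed GetOrder runs
// the classic matrix-chain dynamic program
//   m[i][j] = min over k of (m[i][k] + m[k+1][j] + p[i]*p[k+1]*p[j+1])
// and MatChainMul / MatChainMulGrad replay the resulting order.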
tmp_out.mutable_data(place); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), - ins_dims[2], &tmp_dout, dx[2]); - CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], - ins_dims[1], dx[0], dx[1]); - } else { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - tmp_out.mutable_data(place); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], - tmp_dout.dims(), dx[0], &tmp_dout); - CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], - ins_dims[2], dx[1], dx[2]); - } - } else { - MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - } - } -}; - template class MultiDotOpGradMaker : public framework::SingleGradOpMaker { public: @@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CPU_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CUDA_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); -#endif diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 1143f9cb37aa54bea430d3a8bca8b62b02da4e2b..0113f638b9a47d161c890a0f547f8680af4018e7 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, - PT_INFER_META(phi::MultinomialInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PD_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index ab9f10070fc60deab8974ae0e81e2b4c6cef2ffd..bf7222fc45c66085473eae627abe97b8a41d4268 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -16,8 +16,11 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); - OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); - - auto dim_x = context->GetInputDim("X"); - auto dim_vec = context->GetInputDim("Vec"); - PADDLE_ENFORCE_EQ( - dim_x.size(), 2, - platform::errors::InvalidArgument( - "The rank of input X should be 2, but is %d", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_vec.size(), 1, - platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_vec.size())); - PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], - platform::errors::InvalidArgument( - "X's second dimension is expected to be equal to " - "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", - dim_x, dim_vec)); - - framework::DDim dim_out = phi::make_ddim({dim_x[0]}); - - context->SetOutputDim("Out", dim_out); - context->ShareLoD("X", /*->*/ "Out"); - } }; template @@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PD_INFER_META(phi::MvInferMeta)); + REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, - ops::MVOpGradMaker); + ops::MVOpGradMaker, + MvInferShapeFunctor); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec876d034c1af923a4f7077c096000c..a4e1f7b3091a9f692e479300310333bfdd359096 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
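Before the shape checks and CPU kernels removed below, a compact editorial sketch of what nll_loss computes in the plain 2-D case; the name is assumed for illustration and this is not the phi kernel that now backs the op. Each sample contributes -weight[label] * x[sample, label], samples whose label equals ignore_index are skipped, and the "mean" reduction divides by the sum of the selected weights.

#include <cstdint>

// Sketch of nll_loss_1D with reduction == "mean"; see the removed
// nll_loss_op.h below for the full reduction and 4-D variants.
double NllLoss1DMean(const float* x, const int64_t* label, const float* weight,
                     int64_t batch_size, int64_t n_classes,
                     int64_t ignore_index) {
  double loss = 0.0, total_weight = 0.0;
  for (int64_t i = 0; i < batch_size; ++i) {
    const int64_t cur = label[i];
    if (cur == ignore_index) continue;            // skipped samples add nothing
    const double w = weight ? weight[cur] : 1.0;  // optional per-class weight
    total_weight += w;
    loss -= w * x[i * n_classes + cur];
  }
  return total_weight != 0.0 ? loss / total_weight : loss;
}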
*/ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight", - "NLLLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto reduction = ctx->Attrs().Get("reduction"); - - PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true, - platform::errors::InvalidArgument( - "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - PADDLE_ENFORCE_EQ( - x_dims[0], label_dims[0], - platform::errors::InvalidArgument( - "ShapeError: Expected input batch_size to match label batch_size," - "But received: the Input(x) batch_size is [%s], the Input(label) " - " batch_size is [%s].", - x_dims[0], label_dims[0])); - if (ctx->HasInput("Weight")) { - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ(w_dims.size(), 1, - platform::errors::InvalidArgument( - "Input(Weight) should be a 1D tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[1], w_dims[0], - platform::errors::InvalidArgument( - "Expected input tensor Weight's size should equal " - "to the first dimension of the input tensor X. 
But received " - "Weight's " - "size is %d, the first dimension of input X is %d", - w_dims[0], x_dims[1])); - } - } - if (x_dims.size() == 2) { - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } else if (x_dims.size() == 4) { - PADDLE_ENFORCE_EQ(label_dims.size(), 3, - platform::errors::InvalidArgument( - "Expected Input(Lable) dimensions=3, received %d.", - label_dims.size())); - auto input0 = x_dims[0]; - auto input2 = x_dims[2]; - auto input3 = x_dims[3]; - auto label0 = label_dims[0]; - auto label1 = label_dims[1]; - auto label2 = label_dims[2]; - PADDLE_ENFORCE_EQ( - input0 == label0 && input2 == label1 && input3 == label2, true, - platform::errors::InvalidArgument("Input(X) tensor shape should " - "match to Input(Label) tensor " - "shape.")); - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } - ctx->SetOutputDim("Total_weight", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -259,15 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor, + PD_INFER_META(phi::NllLossRawInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, - ops::NLLLossGradMaker); + ops::NLLLossGradMaker, + NllLossRawInferShapeFunctor); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( - nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4ac6a475477c025c4b76eabdbf4f9e0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. 
" - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index c400a8f4239a605414bf0d99a6a89b0ddae6c535..0ed1f2719de25bd2c138c23dd69b914a66961464 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( } template -void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, +void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, const Tensor *X, const Tensor *Scale, const Tensor *dY, const Tensor *Saved_mean, - const Tensor *Saved_variance, const double epsilon, + const Tensor *Saved_variance, const Tensor *Mean, + const Tensor *Variance, const double epsilon, const bool use_global_stats, const Tensor *ddX, const Tensor *ddScale, const Tensor *ddBias, Tensor *dX, Tensor *dScale, Tensor *ddY) { @@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, Tensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); #ifdef __HIPCC__ @@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, #else const int block = 512; #endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); int grid1 = (num + block - 1) / block; const T *mean_data, *variance_data; if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); + const auto *running_mean = Mean; + const auto *running_var = Variance; const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); mean_data = running_mean_data; @@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } else { const T *smean_data = Saved_mean->data(); const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; variance_data = svariance_data; } if (dX) { T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); + set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } else { DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } @@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (dScale) { T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); + set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } else { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } else { DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } @@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (ddY) { T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); + set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { 
DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index b96fcaa486cce8099cf1d03c7d948ea74c1923ad..372a71706ab5ec72b6da4cbac1b63333f42cb265 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -17,8 +17,10 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index ad7f93d73e902bbac684832d3a77ba83b517daf6..315831ddc0f290cc8c7ad1b78ce8625722f91d3b 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - platform::errors::InvalidArgument( - "Input(Param) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - platform::errors::InvalidArgument( - "Input(Grad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredGrad"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredUpdate"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("ParamOut"), true, - platform::errors::InvalidArgument( - "Output(ParamOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredGradOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredUpdateOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_NE( - phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. 
You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function.")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - platform::errors::InvalidArgument( - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - platform::errors::InvalidArgument( - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("AvgSquaredGradOut", param_dim); - ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -149,7 +81,11 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); -REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PD_INFER_META(phi::AdadeltaInferMeta)); +REGISTER_OPERATOR( + adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdadeltaInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h deleted file mode 100644 index 85cfad35858bbe6b112169f196c0711d981e9446..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdadeltaOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto avg_squared_grad_out_tensor = - ctx.Output("AvgSquaredGradOut"); - auto avg_squared_update_out_tensor = - ctx.Output("AvgSquaredUpdateOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - - T rho = static_cast(ctx.Attr("rho")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - // Squared gradient accumulator - auto avg_squared_grad = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredGrad")); - // Squared updates accumulator - auto avg_squared_update = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredUpdate")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto avg_squared_grad_out = - framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); - auto avg_squared_update_out = - framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto& place = *ctx.template device_context().eigen_device(); - - avg_squared_grad_out.device(place) = - rho * avg_squared_grad + (1 - rho) * grad.square(); - auto update = - -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) - .sqrt() * - grad; - avg_squared_update_out.device(place) = - rho * avg_squared_update + (1 - rho) * update.square(); - param_out.device(place) = param + update; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index a95a37c980c8c9d41dc9fd352e3dace787a7c4e9..036839dd1300feac544a6f1ca661598f4360f745 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
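// Editor's note (illustrative sketch only): the hand-written AdadeltaOpKernel
// is deleted above and the op now routes through the phi infrastructure, but
// the update rule it implemented is unchanged. A scalar reference version of
// that rule, with hypothetical values:
#include <cmath>
#include <cstdio>

int main() {
  float rho = 0.95f, epsilon = 1e-6f;
  float param = 1.0f, grad = 0.1f;
  float avg_sq_grad = 0.0f, avg_sq_update = 0.0f;  // accumulator state

  // E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
  avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad * grad;
  // delta_t = -sqrt((E[dx^2]_{t-1} + eps) / (E[g^2]_t + eps)) * g_t
  float update =
      -std::sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon)) * grad;
  // E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_t^2
  avg_sq_update = rho * avg_sq_update + (1 - rho) * update * update;
  // theta_t = theta_{t-1} + delta_t
  param += update;

  std::printf("param=%f\n", param);
  return 0;
}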
*/ -#include "paddle/fluid/operators/optimizers/adamax_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", - "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax"); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", - "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut", - "Adamax"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension")); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("InfNorm"), - platform::errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - ctx->SetOutputDim("InfNormOut", param_dims); - } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -150,7 +92,11 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PD_INFER_META(phi::AdamaxInferMeta)); + +REGISTER_OPERATOR( + adamax, ops::AdamaxOp, ops::AdamaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamaxInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h deleted file mode 100644 index df0112448b1cbc82d699dc1ee6f3444bda3b142b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdamaxOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); - auto inf_norm_out_tensor = ctx.Output("InfNormOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment_out_tensor->mutable_data(ctx.GetPlace()); - inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto inf_norm = framework::EigenVector::Flatten( - *ctx.Input("InfNorm")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto inf_norm_out = - framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(*place) = - 
grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); - auto lr_t = lr / (1 - beta1_pow); - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index ab8b4f2b8f4d37d4be62c5e1dd040a1461d0bdee..a3fbb0e59e24e9be67da5048ebc644f08b385bbf 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, PADDLE_ENFORCE_NE( static_cast(x), static_cast(y), platform::errors::InvalidArgument("Inplace cast is not supported yet.")); - int vec_size = - std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y)); + int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y)); switch (vec_size) { case 4: return details::VecCastKernel(ctx, x, y, n); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index bea019f1f36e2ea21890f23b753b4df1d62c0e3b..c86f544ed77ff13cc59735971cf856f66bc12202 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/operators/optimizers/momentum_op.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" +#include "paddle/phi/kernels/sgd_kernel.h" namespace paddle { namespace operators { @@ -26,8 +26,7 @@ template class DGCMomentumKernel : public framework::OpKernel { public: DGCMomentumKernel() - : _momentum_op_kernel(new MomentumOpKernel()), - _sgd_op_kernel(new SGDOpKernel()) {} + : _momentum_op_kernel(new MomentumOpKernel()) {} void Compute(const framework::ExecutionContext& context) const override { auto rampup_begin_step = context.Attr("rampup_begin_step"); @@ -67,12 +66,68 @@ class DGCMomentumKernel : public framework::OpKernel { } VLOG(10) << " so use sgd optimizer"; - return _sgd_op_kernel->Compute(context); + + const auto* param_var = context.InputVar("Param"); + const auto* grad_var = context.InputVar("Grad"); + auto* learning_rate = context.Input("LearningRate"); + bool multi_precision = context.Attr("multi_precision"); + if (param_var->IsType()) { + auto* param = context.Input("Param"); + auto* param_out = context.Output("ParamOut"); + auto* master_param_out = + context.Output("MasterParamOut"); + paddle::optional master_param_opt = + paddle::none; + if (multi_precision) { + auto* master_param = context.Input("MasterParam"); + master_param_opt = *master_param; + } + + if (grad_var->IsType()) { + // sgd_dense + auto* grad = context.Input("Grad"); + phi::SGDDenseKernel( + static_cast::TYPE&>(dev_ctx), + *param, *learning_rate, *grad, master_param_opt, multi_precision, + param_out, master_param_out); + } else { + // sgd dense param sparse grad + auto* grad = context.Input("Grad"); + phi::SGDDenseParamSparseGradKernel( + static_cast::TYPE&>(dev_ctx), + *param, *learning_rate, *grad, master_param_opt, multi_precision, + param_out, master_param_out); + } + } else if (param_var->IsType() && + grad_var->IsType() && + platform::is_cpu_place(context.GetPlace())) { + // sgd sparse param sparse grad + auto* param = context.Input("Param"); + auto* param_out = context.Output("ParamOut"); + auto* master_param_out = + context.Output("MasterParamOut"); + paddle::optional 
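// Editor's note (illustrative sketch only): as with Adadelta, the hand-written
// AdamaxOpKernel deleted above goes away while the update rule stays the same.
// A scalar reference of that rule with hypothetical values:
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  float beta1 = 0.9f, beta2 = 0.999f, epsilon = 1e-8f;
  float lr = 0.002f, beta1_pow = beta1;  // beta1^t at step t = 1
  float param = 1.0f, grad = 0.1f;
  float moment = 0.0f, inf_norm = 0.0f;  // first moment and infinity norm

  moment = beta1 * moment + (1 - beta1) * grad;
  inf_norm = std::max(std::fabs(grad), beta2 * inf_norm + epsilon);
  float lr_t = lr / (1 - beta1_pow);  // bias-corrected learning rate
  param -= lr_t * moment / inf_norm;

  std::printf("param=%f\n", param);
  return 0;
}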
master_param_opt = + paddle::none; + if (multi_precision) { + auto* master_param = context.Input("MasterParam"); + master_param_opt = *master_param; + } + auto* grad = context.Input("Grad"); + phi::SGDSparseParamSparseGradKernel( + static_cast::TYPE&>(dev_ctx), + *param, *learning_rate, *grad, master_param_opt, multi_precision, + param_out, master_param_out); + + } else { + PADDLE_THROW("gdc not support yet"); + } } private: std::unique_ptr> _momentum_op_kernel; - std::unique_ptr> _sgd_op_kernel; }; } // namespace operators diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 28c6efef14178535d7f9473c2310552037952c9f..efec50efa92ea68cb68934bde32e1f56570b0868 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddOutput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddOutput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddOutput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddOutput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddOutput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); + "+ n_2, ...]. It should be in CPUPlace."); AddOutput( - "WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddOutput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddOutput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); - + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddOutput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("MasterParamOut", "The output master parameter list. It would share the memory of " @@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); - AddAttr>( - "weight_decay", - "The weight decay for each parameter. 
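// Editor's note (sketch of the dispatch pattern only, not the real phi API):
// the DGC-momentum kernel above no longer wraps an SGDOpKernel; it calls one
// of three phi SGD kernels chosen from the Param/Grad variable types, with the
// sparse-param path additionally restricted to CPU. The three-way decision
// looks roughly like this, with stand-in types:
#include <cstdio>
#include <stdexcept>

enum class VarKind { kDenseTensor, kSelectedRows };

void RunSgd(VarKind param, VarKind grad, bool on_cpu) {
  if (param == VarKind::kDenseTensor && grad == VarKind::kDenseTensor) {
    std::puts("phi::SGDDenseKernel (dense param, dense grad)");
  } else if (param == VarKind::kDenseTensor && grad == VarKind::kSelectedRows) {
    std::puts("phi::SGDDenseParamSparseGradKernel (dense param, sparse grad)");
  } else if (param == VarKind::kSelectedRows &&
             grad == VarKind::kSelectedRows && on_cpu) {
    std::puts("phi::SGDSparseParamSparseGradKernel (sparse param, sparse grad)");
  } else {
    throw std::runtime_error("dgc_momentum: unsupported Param/Grad combination");
  }
}

int main() {
  RunSgd(VarKind::kDenseTensor, VarKind::kDenseTensor, /*on_cpu=*/false);
  RunSgd(VarKind::kDenseTensor, VarKind::kSelectedRows, /*on_cpu=*/false);
  return 0;
}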
Its " - "shape is equal to the global parameter number."); + AddAttr>("apply_weight_decay", + "Whether to apply weight decay."); AddAttr("alignment", "The alignment in bytes for the fused tensors."); AddAttr("rank", "The global rank of the current process."); AddAttr("nranks", "The global world size."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3445e9b658becda84aa678e9c1f03b3436d63b70..7d8a7186d58b402e208fc749524d996b351abeef 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, << ") , dtype = " << fused_out->dtype(); } -template -static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets, - IndexT *out, - int offset_num, - int out_num) { - CUDA_KERNEL_LOOP_TYPE(i, out_num, int) { - auto idx = phi::funcs::LowerBound(offsets, offset_num, i); - if (idx == offset_num || offsets[idx] != i) { - --idx; - } - out[i] = idx; - } -} - -template -static void CopyVectorToTensor(const std::vector &src, - framework::Tensor *dst, - const platform::Place &place, - gpuStream_t stream) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(place); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); -} - template static void CopyVectorToCPUTensor(const std::vector &src, framework::Tensor *dst) { @@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector &src, std::memcpy(dst_ptr, src_ptr, nbytes); } +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel info->numel_offset = 0; // not determined yet } } + const auto &apply_weight_decay = + ctx.Attr>("apply_weight_decay"); + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto *param_order_t = ctx.Output("ParamOrder"); + auto param_num = fp32_infos.size() + fp16_infos.size(); + param_order_t->Resize({static_cast(param_num)}); + auto *param_order = param_order_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + VLOG(10) << "Fill ParamGradInfo ends"; // Step 2: determine the numel_with_padding and numel_offset @@ -568,45 +596,29 @@ class 
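// Editor's note (sketch): ReorderParamGradInfoList above is a stable
// partition -- parameters whose apply_weight_decay flag is set move to the
// front, the rest follow in their original order, and the returned index marks
// where the weight-decay group ends. The same idea on plain parameter ids:
#include <cstddef>
#include <cstdio>
#include <vector>

// Returns the number of flagged entries; entry i is flagged iff flags[ids[i]].
size_t ReorderByFlag(const std::vector<bool>& flags, std::vector<int>* ids) {
  std::vector<int> origin = *ids;
  size_t j = 0;
  for (int id : origin)  // first pass: keep flagged ids, in order
    if (flags[id]) (*ids)[j++] = id;
  size_t end_of_flagged = j;
  for (int id : origin)  // second pass: append unflagged ids
    if (!flags[id]) (*ids)[j++] = id;
  return end_of_flagged;
}

int main() {
  std::vector<bool> apply_weight_decay = {true, false, true, false};
  std::vector<int> param_ids = {0, 1, 2, 3};
  size_t wd_end = ReorderByFlag(apply_weight_decay, &param_ids);
  std::printf("wd_end=%zu order=%d,%d,%d,%d\n", wd_end, param_ids[0],
              param_ids[1], param_ids[2], param_ids[3]);  // wd_end=2, 0,2,1,3
  return 0;
}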
DistributedFusedLambInitOpKernel VLOG(10) << "Found the sharding arguments"; auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({6}); + param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); param_info[1] = static_cast(fp32_local_param_num); param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[4] = static_cast(fp16_local_param_num); - param_info[5] = static_cast(fp16_infos.size()); + param_info[3] = ClipByBound(fp32_wd_end_idx, fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info[5] = static_cast(fp16_local_param_num); + param_info[6] = static_cast(fp16_infos.size()); + param_info[7] = ClipByBound(fp16_wd_end_idx, fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Global FP32 param num: " << param_info[2]; - VLOG(10) << "Start FP16 idx: " << param_info[3]; - VLOG(10) << "Local FP16 param num: " << param_info[4]; - VLOG(10) << "Global FP16 param num: " << param_info[5]; + VLOG(10) << "Start FP16 idx: " << param_info[4]; + VLOG(10) << "Local FP16 param num: " << param_info[5]; + VLOG(10) << "Global FP16 param num: " << param_info[6]; - // For WeightDecay, shard and perform H2D copy - const auto &origin_weight_decay = - ctx.Attr>("weight_decay"); - PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(), - platform::errors::InvalidArgument( - "The attr(weight_decay) should have the " - "same length with Input(Param).")); - std::vector shard_weight_decay; - shard_weight_decay.reserve(total_local_param_num); - for (size_t i = 0; i < fp32_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]); - } - for (size_t i = 0; i < fp16_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]); - } - - // For FusedIndices, launch CUDA kernel to do binary search - auto *fused_indices_t = ctx.Output("FusedIndices"); - fused_indices_t->Resize({static_cast(total_numel)}); - auto *fused_indices = fused_indices_t->mutable_data(place); std::vector numel_offsets; numel_offsets.reserve(params.size() + 1); for (const auto &info : fp32_infos) { @@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel "The numel_offsets number must be one larger than " "the parameter number.")); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - auto *fused_param_offset_t = - ctx.Output("FusedParamOffsets"); - fused_param_offset_t->Resize({static_cast(numel_offsets.size())}); - auto *fused_param_offset = fused_param_offset_t->mutable_data(place); - memory::Copy(place, fused_param_offset, platform::CPUPlace(), - numel_offsets.data(), - numel_offsets.size() * sizeof(numel_offsets[0]), stream); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel); - LambFillFusedIndicesCUDAKernel<<>>( - fused_param_offset, fused_indices, numel_offsets.size() - 1, - total_numel); - - std::vector lengths; - lengths.reserve(fp32_local_param_num + fp16_local_param_num); std::vector fp32_partial_numel_offsets; fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); @@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "FP32 Partial 
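// Editor's note (sketch): ParamInfo grows from 6 to 8 entries above because
// each precision now also records how many of its *local* parameters take
// weight decay. The global weight-decay end index is clamped into the local
// shard and rebased to the shard start; assumed values below:
#include <cstdio>

template <typename T>
T ClipByBound(T x, T low, T high) {  // mirrors the helper added in the patch
  if (x < low) return low;
  if (x > high) return high;
  return x;
}

int main() {
  int start_idx = 4;        // assumed: first param index owned by this rank
  int local_param_num = 3;  // assumed: number of params owned by this rank
  int wd_end_idx = 6;       // assumed: global index where weight decay stops

  int local_wd_end =
      ClipByBound(wd_end_idx, start_idx, start_idx + local_param_num) - start_idx;
  std::printf("local params with weight decay: %d of %d\n", local_wd_end,
              local_param_num);  // 2 of 3
  return 0;
}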
numel = [" << valid_start_n + fp32_infos[i].numel << "," << end_n + fp32_infos[i].numel; - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - lengths.back()); + len); } std::vector fp16_partial_numel_offsets; @@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel PADDLE_ENFORCE_NE(valid_start_n, end_n, platform::errors::InvalidArgument( "Indices sharding error. This may be a bug.")); - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - lengths.back()); + len); } CopyVectorToCPUTensor(numel_offsets, @@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel fp16_partial_numel_offsets, ctx.Output("FP16ShardFusedParamOffsets")); - // Fill the weight decay tensor - PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - std::vector wd_cpu; - for (size_t i = 0; i < shard_weight_decay.size(); ++i) { - int len = lengths[i]; - for (int j = 0; j < len; ++j) { - wd_cpu.push_back(shard_weight_decay[i]); - } - } - PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel, - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - CopyVectorToTensor(wd_cpu, ctx.Output("WeightDecay"), - place, stream); - auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index e5b27446eb330aeb08e134332a5366c6c6ed2908..8f7c87912e93aa1bb3178d37afa641047e15a82b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddInput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddInput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddInput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddInput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddInput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); - AddInput("WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "+ n_2, ...]. It should be in CPUPlace."); + AddInput( + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddInput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddInput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. 
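// Editor's note (sketch): the *FusedParamOffsets outputs above are prefix sums
// over parameter numels with one leading zero, of shape [param_num + 1], kept
// in CPUPlace so host-side launchers can slice the fused buffer. Building such
// an offset table from assumed per-parameter sizes:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> numels = {4, 7, 2};  // assumed per-parameter numels
  std::vector<int> offsets = {0};       // shape is [param_num + 1]
  for (int n : numels) offsets.push_back(offsets.back() + n);

  // offsets = [0, n_0, n_0+n_1, n_0+n_1+n_2] = [0, 4, 11, 13];
  // parameter i occupies [offsets[i], offsets[i+1]) of the fused tensor.
  for (size_t i = 0; i + 1 < offsets.size(); ++i)
    std::printf("param %zu -> [%d, %d)\n", i, offsets[i], offsets[i + 1]);
  return 0;
}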
It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddInput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddInput("LearningRate", "The fp32 learning rate tensor. Its shape is [1]."); @@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "max_global_grad_norm", "The maximum global gradient l2-norm value for clipping. If " "max_global_grad_norm <= 0, no clipping would be performed."); + AddAttr("weight_decay", "The weight decay value."); AddAttr("clip_after_allreduce", "Whether to clip before allreduce, only valid when the " "world size is larger than 1."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 3f90140f77282983f42ef03f736c35960239dd75..5b60f65442b55dc89a845859f153048e89704f70 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -19,11 +19,11 @@ #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -66,8 +66,8 @@ struct L2NormFunctor { int i; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += (BlockDim * VecSize)) { - platform::AlignedVector tmp_vec; - platform::Load(ptr + i, &tmp_vec); + phi::AlignedVector tmp_vec; + phi::Load(ptr + i, &tmp_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { auto tmp = static_cast(tmp_vec[j]); @@ -87,7 +87,7 @@ struct L2NormFunctor { } }; -template +template static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( const InT *x, OutT *y, int max_chunk_num) { int tensor_id = blockIdx.x; @@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( } sum = BlockReduce(storage).Reduce(sum, cub::Sum()); if (threadIdx.x == 0) { - if (NeedSqrt) { - y[blockIdx.x] = static_cast(sqrtf(sum)); - } else { - y[blockIdx.x] = static_cast(sum); - } + y[blockIdx.x] = static_cast(sum); } } @@ -115,9 +111,10 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int max_load_bits = 128; int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); auto address = reinterpret_cast(ptr); - constexpr int vec8 = alignof(platform::AlignedVector); - constexpr int vec4 = alignof(platform::AlignedVector); - constexpr int vec2 = alignof(platform::AlignedVector); + constexpr int vec8 = alignof(phi::AlignedVector); + constexpr int vec4 = alignof(phi::AlignedVector); + constexpr int vec2 = alignof(phi::AlignedVector); + chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { @@ -129,27 +126,26 @@ static int 
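// Editor's note (sketch): MultiTensorL2Norm above works in two passes -- each
// chunk accumulates a partial sum of squares into tmp_out, then a second
// kernel adds the per-tensor partials (the NeedSqrt branch was dropped, so the
// result is the *squared* norm). A serial C++ rendering of the two passes for
// one tensor with assumed values:
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  const int chunk_size = 4;
  const int max_chunk_num =
      (static_cast<int>(x.size()) + chunk_size - 1) / chunk_size;

  // Pass 1: one partial sum of squares per chunk.
  std::vector<float> partial(max_chunk_num, 0.f);
  for (size_t i = 0; i < x.size(); ++i) partial[i / chunk_size] += x[i] * x[i];

  // Pass 2: reduce the partials; note: no sqrt here.
  float squared_norm = 0.f;
  for (float p : partial) squared_norm += p;
  std::printf("||x||^2 = %f\n", squared_norm);  // 91
  return 0;
}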
GetChunkedVecSize(const T *ptr, int chunk_size) { } } -#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ - case __vec_size: { \ - constexpr int kVecSize = __vec_size; \ - __VA_ARGS__; \ - break; \ +#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ } -#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ - do { \ - switch (__vec_size) { \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ - } \ +#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \ + } \ } while (0) // TODO(zengjinle): which chunk_size is better? -template +template static void MultiTensorL2Norm(const platform::CUDAPlace &place, gpuStream_t stream, const InT *x, const int *offsets, int n, OutT *y, @@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; - constexpr int kBlockDim = BlockDim; + constexpr int kBlockDim = 512; int max_chunk_num = -1; int vec_size = 8; @@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); -#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ - do { \ - using FunctorT = L2NormFunctor; \ - VLOG(10) << __func__ << " " << typeid(InT).name() \ - << " VecSize = " << kVecSize; \ - MultiTensorApply( \ - FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ - max_chunk_num); \ +#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \ + max_chunk_num); \ } while (0) - PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); -#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel<<>>( - tmp_out_ptr, y, max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel< + MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, + max_chunk_num); } template @@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; + const auto *order = ctx.Input("ParamOrder")->data(); + size_t n = tensors.size(); auto place = tensors[0]->place(); auto pn_vec = ToVector(param_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); - std::vector fp32_indices, fp16_indices; - fp32_indices.reserve(n); - fp16_indices.reserve(n); - for (size_t i = 0; i < n; ++i) { - const auto *t = tensors[i]; - if (t->dtype() == phi::DataType::FLOAT32) { - fp32_indices.push_back(i); - } else if (t->dtype() == phi::DataType::FLOAT16) { - fp16_indices.push_back(i); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", t->dtype())); - } - } - - for (auto idx : fp16_indices) { - 
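// Editor's note (sketch): GetChunkedVecSize and the PD_VEC_LAUNCH_KERNEL macro
// above pick the widest vector width (8/4/2/1) whose alignment both the
// pointer and the chunk byte size satisfy, capped by 128-bit loads, and then
// dispatch to a kernel templated on that width. A host-side rendering of the
// same selection and switch, with a stand-in Process kernel:
#include <algorithm>
#include <cstdint>
#include <cstdio>

template <int VecSize>
void Process(const float* x, int n) {  // stand-in for the templated kernel
  std::printf("launch with VecSize=%d over %d elements\n", VecSize, n);
}

int ChooseVecSize(const float* ptr, int chunk_bytes) {
  const int max_vec = 16 / static_cast<int>(sizeof(float));  // 128-bit loads
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  const int widths[] = {8, 4, 2};
  for (int vec : widths) {
    int align = vec * static_cast<int>(sizeof(float));  // AlignedVector alignment
    if (addr % align == 0 && chunk_bytes % align == 0)
      return std::min(vec, max_vec);
  }
  return 1;
}

int main() {
  alignas(32) float data[16] = {};
  int vec_size = ChooseVecSize(data, static_cast<int>(sizeof(data)));
  switch (vec_size) {  // mirrors PD_VEC_LAUNCH_KERNEL's case dispatch
    case 8: Process<8>(data, 16); break;
    case 4: Process<4>(data, 16); break;
    case 2: Process<2>(data, 16); break;
    default: Process<1>(data, 16); break;
  }
  return 0;
}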
fp32_indices.push_back(idx); - } - const auto &names = ctx.GetOp().Inputs("Param"); - for (size_t i = 0; i < fp32_indices.size(); ++i) { - auto idx = fp32_indices[i]; + for (size_t i = 0; i < n; ++i) { + auto idx = order[i]; VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; } @@ -325,14 +304,30 @@ struct AndFunctor { HOSTDEVICE bool operator()(bool x, bool y) const { return x && y; } }; -template +template static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, const T2 *__restrict__ scale, T1 *__restrict__ y, int num) { static_assert(sizeof(T1) <= sizeof(T2), "sizeof(T1) must be not greater than sizeof(T2)."); T2 s = scale[0]; - CUDA_KERNEL_LOOP(i, num) { + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; + + phi::Load(x + i, &x_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + y_vec[j] = static_cast(static_cast(x_vec[j]) * s); + } + phi::Store(y_vec, y + i); + } + + for (; i < num; ++i) { y[i] = static_cast(static_cast(x[i]) * s); } } @@ -353,7 +348,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale( const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, T2 *__restrict__ out2, T1 clip_rescale_grad) { - T1 grad_norm = static_cast(sqrt(*square_grad_norm)) * clip_rescale_grad; + T1 grad_norm = static_cast(sqrtf(*square_grad_norm)) * clip_rescale_grad; T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); bool found_nan_inf = !isfinite(scale); if (scale >= 1 || found_nan_inf) { @@ -380,19 +375,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; } -// TODO(zengjinle): Vectorize this function -// NOTE: this method does not update Beta1Pow and Beta2Pow! 
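// Editor's note (sketch): the rewritten ScaleCUDAKernel above follows a common
// vectorization pattern -- a grid-stride main loop that loads and stores
// VecSize elements at a time through phi::AlignedVector (phi::Load/phi::Store),
// plus a scalar tail loop for the leftover elements. The same structure in
// plain single-threaded C++:
#include <cstdio>
#include <vector>

template <int VecSize>
void Scale(const float* x, float s, float* y, int num) {
  int i = 0;
  for (; i + VecSize <= num; i += VecSize) {  // vectorized body
    float xv[VecSize], yv[VecSize];           // stands in for AlignedVector
    for (int j = 0; j < VecSize; ++j) xv[j] = x[i + j];  // phi::Load
    for (int j = 0; j < VecSize; ++j) yv[j] = xv[j] * s;
    for (int j = 0; j < VecSize; ++j) y[i + j] = yv[j];  // phi::Store
  }
  for (; i < num; ++i) y[i] = x[i] * s;       // scalar tail
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5}, y(5);
  Scale<4>(x.data(), 0.5f, y.data(), 5);
  for (float v : y) std::printf("%g ", v);    // 0.5 1 1.5 2 2.5
  std::printf("\n");
  return 0;
}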
-template -static __global__ void UpdateLambMoment( +template +static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ square_grad_norm_p, - const T *__restrict__ global_scale, const IndexT *__restrict__ indices, - const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p, + const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, - T epsilon, T max_global_grad_norm, int num, T rescale_grad) { + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, + T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; - if (!isfinite(square_grad_norm)) return; + bool need_update_found_inf = + (found_inf && threadIdx.x == 0 && blockIdx.x == 0); + if (!isfinite(square_grad_norm)) { + if (need_update_found_inf) *found_inf = true; + return; + } else if (need_update_found_inf) { + *found_inf = false; + } T scale = rescale_grad / global_scale[0]; if (max_global_grad_norm > 0) { @@ -406,27 +406,111 @@ static __global__ void UpdateLambMoment( T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0]; - CUDA_KERNEL_LOOP(i, num) { - T p = param_p[i]; - T g = static_cast(grad_p[i]) * scale; - T weight_decay = weight_decay_p[i]; - T mom1 = mom1_p[i]; - T mom2 = mom2_p[i]; + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; - mom1 = beta1 * mom1 + (1 - beta1) * g; - mom2 = beta2 * mom2 + (1 - beta2) * g * g; + for (; i + VecSize <= num; i += stride) { + phi::AlignedVector param_vec; + phi::AlignedVector grad_vec; + phi::AlignedVector mom1_vec; + phi::AlignedVector mom2_vec; + phi::AlignedVector trust_ratio_div_vec; - T mom1_unbiased = mom1 / one_minus_beta1pow; - T mom2_unbiased = mom2 / one_minus_beta2pow; - T trust_ratio_div = - mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + if (cur_weight_decay != static_cast(0.0)) { + phi::Load(param_p + i, ¶m_vec); + } else { +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + param_vec[j] = static_cast(0); + } + } + phi::Load(grad_p + i, &grad_vec); + phi::Load(mom1_p + i, &mom1_vec); + phi::Load(mom2_p + i, &mom2_vec); + +#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ + __trust_ratio_div, __idx) \ + T p = __param[__idx]; \ + T g = static_cast(__grad[__idx]) * scale; \ + T mom1 = __mom1[__idx]; \ + T mom2 = __mom2[__idx]; \ + mom1 = beta1 * mom1 + (1 - beta1) * g; \ + mom2 = beta2 * mom2 + (1 - beta2) * g * g; \ + T mom1_unbiased = mom1 / one_minus_beta1pow; \ + T mom2_unbiased = mom2 / one_minus_beta2pow; \ + __trust_ratio_div[__idx] = \ + mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \ + __mom1[__idx] = mom1; \ + __mom2[__idx] = mom2; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec, + mom2_vec, trust_ratio_div_vec, j); + } + + phi::Store(mom1_vec, mom1_p + i); + phi::Store(mom2_vec, mom2_p + i); + phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + } - mom1_p[i] = mom1; - mom2_p[i] = mom2; - trust_ratio_div_p[i] = trust_ratio_div; + for (; i < num; ++i) { + T 
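// Editor's note (scalar sketch of the per-element math in the kernel above,
// with assumed hyper-parameters): weight decay is now a single attribute that
// applies only to elements below weight_decay_end_numel, replacing the old
// per-element WeightDecay tensor indexed through FusedIndices.
#include <cmath>
#include <cstdio>

int main() {
  float beta1 = 0.9f, beta2 = 0.999f, epsilon = 1e-6f, weight_decay = 0.01f;
  float beta1pow = beta1, beta2pow = beta2;  // beta^t at step t = 1
  int weight_decay_end_numel = 8;            // assumed shard boundary

  float p = 1.0f, g = 0.1f, mom1 = 0.f, mom2 = 0.f;
  int i = 3;  // element index within the local shard

  float cur_wd = (i < weight_decay_end_numel) ? weight_decay : 0.f;
  mom1 = beta1 * mom1 + (1 - beta1) * g;
  mom2 = beta2 * mom2 + (1 - beta2) * g * g;
  float mom1_unbiased = mom1 / (1 - beta1pow);
  float mom2_unbiased = mom2 / (1 - beta2pow);
  float trust_ratio_div =
      mom1_unbiased / (std::sqrt(mom2_unbiased) + epsilon) + cur_wd * p;

  std::printf("trust_ratio_div=%f\n", trust_ratio_div);
  return 0;
}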
cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p, + trust_ratio_div_p, i); } } +template +static void MultiTensorUpdateLambMomentAndTrustRatioDiv( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, + const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, + int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T max_global_grad_norm, T rescale_grad) { + if (n <= 0) return; + int numel = offsets[n] - offsets[0]; + PADDLE_ENFORCE_GE(weight_decay_end_idx, 0, + platform::errors::InvalidArgument( + "The weight decay end index should be >= 0.")); + PADDLE_ENFORCE_LE(weight_decay_end_idx, n, + platform::errors::InvalidArgument( + "The weight decay end index should be < %d.", n)); + auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0]; + + int vec_size = GetChunkedVecSize(param_p, 0); + vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0)); + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + while (length % vec_size != 0) { + vec_size /= 2; + } + } + + VLOG(1) << __func__ << " VecSize = " << vec_size; + + auto stream = dev_ctx.stream(); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); +#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL +} + template struct LambBetaPowUpdateOnceHelper { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { @@ -468,33 +552,6 @@ struct LambBetaPowUpdateOnceHelper { HOSTDEVICE void UpdateBetaPows() const {} }; -template -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) { - PADDLE_ENFORCE_NOT_NULL(found_inf, - platform::errors::InvalidArgument( - "The found_inf should not be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; } - - private: - bool *__restrict__ found_inf_; -}; - -template <> -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) { - PADDLE_ENFORCE_EQ( - found_inf, nullptr, - platform::errors::InvalidArgument("The found_inf should be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool) {} -}; - template struct LambParamHelper { LambParamHelper(T *param, MasterT *master_param) { @@ -509,12 +566,9 @@ struct LambParamHelper { master_param_ = master_param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - master_param_[i] = updated_p; - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { return master_param_[i]; } + HOSTDEVICE MasterT *__restrict__ MasterParamPtr() { return 
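// Editor's note (sketch): the host launcher above picks one vector width for
// the whole fused buffer, then halves it until every tensor's length is a
// multiple of it, so no vectorized access straddles a tensor boundary.
// Assumed offsets below:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> offsets = {0, 8, 14, 30};  // assumed fused param offsets
  int vec_size = 8;                           // best case from alignment checks
  for (size_t i = 0; i + 1 < offsets.size(); ++i) {
    int length = offsets[i + 1] - offsets[i];
    while (length % vec_size != 0) vec_size /= 2;
  }
  std::printf("vec_size=%d\n", vec_size);     // the length-6 tensor forces 2
  return 0;
}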
master_param_; } private: T *__restrict__ param_; @@ -538,158 +592,169 @@ struct LambParamHelper { param_ = param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { - return static_cast>(param_[i]); - } + HOSTDEVICE constexpr MasterT *MasterParamPtr() { return nullptr; } private: T *__restrict__ param_; }; -template -struct LambParamAndBetaPowsUpdateHelper - : public LambParamHelper, - public LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>, - public LambFoundInfHelper { - LambParamAndBetaPowsUpdateHelper( - ParamT *param, MasterT *master_param, MasterT *beta1pow, - MasterT *beta2pow, MasterT beta1, MasterT beta2, - bool *found_inf, const MasterT *trust_ratio_div, - const MasterT *lr, const IndexT *index, +template +struct LambUpdateParamAndBetaPowsFunctor { + DEVICE void operator()( + int tensor_id, int chunk_id, int offset, int size, + LambParamHelper param_helper, + const MasterT *trust_ratio_div, const MasterT *lr, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag) - : LambParamHelper(param, master_param), - LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>( - beta1pow, beta2pow, beta1, beta2), - LambFoundInfHelper(found_inf), - trust_ratio_div(trust_ratio_div), - lr(lr), - index(index), - param_square_norm(param_square_norm), - trust_ratio_div_square_norm(trust_ratio_div_square_norm), - update_flag(update_flag) {} - - const MasterT *__restrict__ trust_ratio_div; - const MasterT *__restrict__ lr; - const IndexT *__restrict__ index; - const MasterT *__restrict__ param_square_norm; - const MasterT *__restrict__ trust_ratio_div_square_norm; - const MasterT *__restrict__ update_flag; -}; + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow> + betapow_helper) const { + if (*found_inf) return; -template -static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( - LambParamAndBetaPowsUpdateHelper - args, - int num) { - auto should_update = *args.update_flag; - if (!isfinite(should_update)) { - if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(true); + using MT = MasterT; + + MT p_square_norm = param_square_norm[tensor_id]; + MT t_square_norm = trust_ratio_div_square_norm[tensor_id]; + MT lr_value = *lr; + MT ratio = (p_square_norm != static_cast(0) && + t_square_norm != static_cast(0) + ? lr_value * sqrtf(p_square_norm / t_square_norm) + : lr_value); + + int i; + int stride = blockDim.x * VecSize; + + ParamT *param = param_helper.ParamPtr() + offset; + MT *master_param = HasMasterParam ? 
param_helper.MasterParamPtr() + offset + : param_helper.MasterParamPtr(); + trust_ratio_div += offset; + + for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { + phi::AlignedVector trust_ratio_div_vec; + phi::Load(trust_ratio_div + i, &trust_ratio_div_vec); + if (HasMasterParam) { + phi::AlignedVector master_param_vec; + phi::Load(master_param + i, &master_param_vec); + phi::AlignedVector param_vec; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; + master_param_vec[j] = p; + param_vec[j] = static_cast(p); + } + phi::Store(master_param_vec, master_param + i); + phi::Store(param_vec, param + i); + } else { + phi::AlignedVector param_vec; + phi::Load(param + i, ¶m_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; + param_vec[j] = static_cast(p); + } + phi::Store(param_vec, param + i); + } + } + + for (; i < size; ++i) { + if (HasMasterParam) { + MT p = master_param[i] - ratio * trust_ratio_div[i]; + master_param[i] = p; + param[i] = static_cast(p); + } else { + MT p = static_cast(param[i]) - ratio * trust_ratio_div[i]; + param[i] = static_cast(p); + } + } + + if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { + betapow_helper.UpdateBetaPows(); } - return; - } else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(false); } +}; - if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateBetaPows(); +// TODO(zengjinle): which block_dim and chunk_size would be better? +template +static void MultiTensorUpdateLambParamAndBetaPows( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const MasterT *trust_ratio_div, const MasterT *lr, + const MasterT *param_square_norm, + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + ParamT *param, MasterT *master_param, MasterT *beta1pow, + MasterT *beta2pow, MasterT beta1, MasterT beta2, + int chunk_size = 65536) { + constexpr bool kHasMasterParam = + !(std::is_same>::value); + + bool has_beta_pow = (beta1pow != nullptr); + if (has_beta_pow) { + PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( + "Beta2Pow should not be nullptr.")); + } else { + PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( + "Beta2Pow should be nullptr.")); } - using MT = MasterT; + const int block_dim = 512; - MT lr_value = *args.lr; - CUDA_KERNEL_LOOP(i, num) { - MT p = args.GetParam(i); - MT t = args.trust_ratio_div[i]; - auto norm_idx = args.index[i]; - MT p_square_norm = args.param_square_norm[norm_idx]; - MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; + int vec_size = 8; + for (int i = 0; i < n; ++i) { + int offset = offsets[i] - offsets[0]; + vec_size = + std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size)); + if (kHasMasterParam) { + vec_size = std::min(vec_size, + GetChunkedVecSize(master_param + offset, chunk_size)); + } + vec_size = std::min( + vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size)); + } - MT p_norm = static_cast(sqrtf(p_square_norm)); - MT t_norm = static_cast(sqrtf(t_square_norm)); + VLOG(1) << __func__ << " VecSize = " << vec_size; - auto update = (p_norm != static_cast(0) && t_norm != static_cast(0)) - ? 
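// Editor's note (scalar sketch of the parameter update in the functor above):
// the per-tensor trust ratio scales the learning rate by
// sqrt(||param||^2 / ||trust_ratio_div||^2), falling back to the plain
// learning rate when either squared norm is zero; fp16 parameters keep an
// fp32 master copy. Hypothetical values:
#include <cmath>
#include <cstdio>

int main() {
  float lr = 1e-3f;
  float p_square_norm = 4.0f;   // assumed ||param||^2 for this tensor
  float t_square_norm = 16.0f;  // assumed ||trust_ratio_div||^2
  float param = 1.0f, trust_ratio_div = 0.5f;

  float ratio = (p_square_norm != 0.f && t_square_norm != 0.f)
                    ? lr * std::sqrt(p_square_norm / t_square_norm)
                    : lr;
  param -= ratio * trust_ratio_div;
  std::printf("ratio=%g param=%f\n", ratio, param);
  return 0;
}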
p_norm / t_norm - : static_cast(1); + constexpr auto kNumTensor = MaxTensorNumPerLaunch; + constexpr auto kNumChunk = MaxChunkNumPerLaunch; - MT updated_p = p - lr_value * update * t; - args.SetParam(i, updated_p); - } -} + auto stream = dev_ctx.stream(); +#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \ + do { \ + using FunctorT = \ + LambUpdateParamAndBetaPowsFunctor; \ + LambParamHelper param_helper(param, \ + master_param); \ + LambBetaPowUpdateOnceHelper, __has_beta_pow> \ + betapow_helper(beta1pow, beta2pow, beta1, beta2); \ + launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \ + param_square_norm, trust_ratio_div_square_norm, found_inf, \ + betapow_helper); \ + } while (0) -template -static void LambUpdateParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, - const MasterT *trust_ratio_div, const MasterT *lr, - const IndexT *index, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag, MasterT **beta1pow, - MasterT **beta2pow, bool **found_inf, MasterT beta1, - MasterT beta2, int num, ParamT *param, - MasterT *master_param, gpuStream_t stream) { - if (num == 0) return; - - bool has_master_param = !(std::is_same>::value); - auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; - auto has_found_inf = (*found_inf) != nullptr; - -#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \ - __has_master_param, __has_beta_pow, __has_found_inf) \ - do { \ - LambParamAndBetaPowsUpdateHelper \ - helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \ - *found_inf, trust_ratio_div, lr, index, param_square_norm, \ - trust_ratio_div_square_norm, update_flag); \ - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \ - LambUpdateParamAndBetaPowsCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \ - num); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = [&]( \ + const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) - if (has_master_param) { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false); - } - } - } else { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false); - } - } - } + PD_VEC_LAUNCH_KERNEL(vec_size, + PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE); - *beta1pow = nullptr; - *beta2pow = nullptr; - *found_inf = nullptr; -#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL +#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW +#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -710,6 +775,24 @@ static bool 
CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, return false; } +template +static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, + const T1 *x, const T2 *scale, T1 *y, int n, + gpuStream_t stream) { + int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel<<>>( \ + x, scale, y, n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); +#undef PD_LAMB_VEC_SCALE_KERNEL_CASE +} + template static void NCCLReduceScatterWithScale( const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, @@ -725,10 +808,8 @@ static void NCCLReduceScatterWithScale( PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - auto numel = recvcount * nranks; - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, recvbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, + stream); } return; } @@ -742,9 +823,7 @@ static void NCCLReduceScatterWithScale( if (scale && !should_destroy_op) { size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, new_sendbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } @@ -1005,15 +1084,16 @@ class DistributedFusedLambOpKernel "Too many parameter number. Only <= %d is supported.", std::numeric_limits::max())); - // Step 3: Get FusedIndices, ParamInfo - const auto *indices = GetInputTensorPtr(ctx, "FusedIndices"); + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_param_num = param_info_tensor[1]; auto fp32_global_param_num = param_info_tensor[2]; - auto fp16_local_start_idx = param_info_tensor[3]; - auto fp16_local_param_num = param_info_tensor[4]; - auto fp16_global_param_num = param_info_tensor[5]; + auto fp32_weight_decay_end_idx = param_info_tensor[3]; + auto fp16_local_start_idx = param_info_tensor[4]; + auto fp16_local_param_num = param_info_tensor[5]; + auto fp16_global_param_num = param_info_tensor[6]; + auto fp16_weight_decay_end_idx = param_info_tensor[7]; auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num; @@ -1031,7 +1111,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // WeightDecay, GlobalScale, FoundInf + // GlobalScale, FoundInf const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1065,14 +1145,15 @@ class DistributedFusedLambOpKernel GetSameInOutTensorPtr(ctx, place, "Beta1Pow", "Beta1PowOut"); auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - const float *weight_decay = GetInputTensorPtr(ctx, "WeightDecay"); auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); - // Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, + // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, + // max_grad_norm, ring_id, // 
use_master_param_norm, is_grad_scaled_by_nranks + auto weight_decay = ctx.Attr("weight_decay"); auto beta1 = ctx.Attr("beta1"); auto beta2 = ctx.Attr("beta2"); auto epsilon = ctx.Attr("epsilon"); @@ -1105,7 +1186,8 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1) { + if (num_devices > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1181,7 +1263,11 @@ class DistributedFusedLambOpKernel float, platform::float16><<<1, 1, 0, stream>>>( global_scale, max_global_grad_norm, fp32_square_grad_norm, fp32_scale, fp16_scale, clip_scale); - VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + if (fp32_scale) { + VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + } else { + VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); + } if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, @@ -1218,36 +1304,56 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. Calcuate the trust_ratio_div + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets = fused_offsets_t->data(); + auto *fp32_partial_fused_offsets_t = + ctx.Input("FP32ShardFusedParamOffsets"); + const auto *fp32_partial_fused_offsets = + fp32_partial_fused_offsets_t->data(); + auto *fp16_partial_fused_offsets_t = + ctx.Input("FP16ShardFusedParamOffsets"); + const auto *fp16_partial_fused_offsets = + fp16_partial_fused_offsets_t->data(); + + VLOG(1) << "FusedParamOffsets: " + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); + VLOG(1) << "FP32ShardFusedParamOffsets: " + << FlattenToString(fp32_partial_fused_offsets, + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); + VLOG(1) << "FP16ShardFusedParamOffsets: " + << FlattenToString(fp16_partial_fused_offsets, + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); + memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); auto fp32_offset = rank * fp32_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device; if (has_fp32_param) { - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device); VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, - moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, - max_global_grad_norm, fp32_numel_each_device, rescale_grad); + global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, + found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, + epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; if (has_fp16_param) { master_param = fp32_param + fp32_numel; - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device); VLOG(10) << "Update FP16 Moment and TrustRatioDiv 
starts"; - UpdateLambMoment<<>>( + auto tmp_found_inf = has_fp32_param ? nullptr : found_inf; + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_numel + fp16_offset, weight_decay, - beta1pow, beta2pow, moment1 + fp32_numel_each_device, + global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, - max_global_grad_norm, fp16_numel_each_device, rescale_grad); + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, + fp16_weight_decay_end_idx, beta1, beta2, epsilon, + max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } @@ -1257,30 +1363,6 @@ class DistributedFusedLambOpKernel memory::Buffer square_norm_buffer(place); auto *param_square_norm = square_norm_buffer.Alloc(2 * param_num); auto *trust_ratio_div_square_norm = param_square_norm + param_num; - - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); - auto *fused_offsets = fused_offsets_t->data(); - auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); - const auto *fp32_partial_fused_offsets = - fp32_partial_fused_offsets_t->data(); - auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); - const auto *fp16_partial_fused_offsets = - fp16_partial_fused_offsets_t->data(); - - VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), - fused_offsets_t->place()); - VLOG(1) << "FP32ShardFusedParamOffsets: " - << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), - fp32_partial_fused_offsets_t->place()); - VLOG(1) << "FP16ShardFusedParamOffsets: " - << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), - fp16_partial_fused_offsets_t->place()); - if (num_devices > 1) { if (use_master_param_norm) { FillZeroWithPtr(param_square_norm + fp32_global_param_num, @@ -1296,11 +1378,11 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - // NOTE: extra computation is performed. We can improve this performance - // if needed in the future. MultiTensorL2Norm( - place, stream, fp16_param, fused_offsets + fp32_global_param_num, - fp16_global_param_num, param_square_norm + fp32_global_param_num); + place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, @@ -1333,26 +1415,29 @@ class DistributedFusedLambOpKernel // Step 9: update parameter, beta1pow, beta2pow. All gather parameters. 
if (has_fp32_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div, lr, indices + fp32_offset, - param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, - &beta1pow, &beta2pow, &found_inf, beta1, beta2, - fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div, lr, param_square_norm + fp32_local_start_idx, + trust_ratio_div_square_norm + fp32_local_start_idx, found_inf, + fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, ncclFloat32, comm, stream)); } + + beta1pow = nullptr; + beta2pow = nullptr; } if (has_fp16_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, - indices + fp32_numel + fp16_offset, param_square_norm, - trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, - &beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, - fp16_param + fp16_offset, master_param + fp16_offset, stream); - + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div + fp32_numel_each_device, lr, + param_square_norm + fp16_local_start_idx, + trust_ratio_div_square_norm + fp16_local_start_idx, found_inf, + fp16_param + fp16_offset, master_param + fp16_offset, beta1pow, + beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index df5da1b79535cc6f5e4a638e9d32c367ea7cdb9f..fe5cd066864b82c734614e33869dff1734bee6d0 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate( T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, const int grid_stride, const int numel, MT* master_param_out = nullptr) { - using VecType = paddle::platform::AlignedVector; - using VecMType = paddle::platform::AlignedVector; + using VecType = phi::AlignedVector; + using VecMType = phi::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 5d8d03c733dae210e8a41a8ad78a258df558b341..179e8f452545c437e373e42d59d18f524f260cd5 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( args...); } -template -static void MultiTensorApply(Functor functor, gpuStream_t stream, - const int *offsets, int n, int chunk_size, - Args... args) { +template +class MultiTensorLauncher { + public: + MultiTensorLauncher( + const TensorMetaList &meta, + const int &chunk_id, const int &chunk_size, const int &block_dim, + const gpuStream_t &stream) + : meta_(meta), + chunk_id_(chunk_id), + chunk_size_(chunk_size), + block_dim_(block_dim), + stream_(stream) {} + + template + void Launch(Functor &&functor, Args &&... 
args) const { + MultiTensorApplyCUDAKernel< + Functor, MaxTensorNumPerLaunch, + MaxChunkNumPerLaunch><<>>( + functor, meta_, chunk_size_, args...); + } + + private: + const TensorMetaList &meta_; + const int &chunk_id_; + const int &chunk_size_; + const int &block_dim_; + const gpuStream_t &stream_; +}; + +template +static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets, + int n, int chunk_size, int block_dim, + Callback &&callback) { if (n == 0) return; constexpr auto NumTensor = MaxTensorNumPerLaunch; @@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, int numel_offset = 0; metas.start_tensor_id = 0; metas.start_chunk_id = 0; + int launch_num = 0; + + MultiTensorLauncher launcher( + metas, chunk_id, chunk_size, block_dim, stream); + for (int i = 0; i < n; ++i) { auto length = offsets[i + 1] - offsets[i]; if (tensor_id == 0) { @@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, bool last_chunk = (i + 1 == n && j + 1 == chunk_num); if (tensor_full || block_full || last_chunk) { - MultiTensorApplyCUDAKernel<<>>( - functor, metas, chunk_size, args...); + callback(launcher, launch_num); + ++launch_num; chunk_id = 0; if (j + 1 == chunk_num) { // chunk for the current tensor is full metas.start_chunk_id = 0; @@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, } } +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + int block_dim, Args &&... args) { + auto callback = [&](const MultiTensorLauncher &launcher, + int i) { launcher.Launch(functor, args...); }; + MultiTensorApplyWithCallback( + stream, offsets, n, chunk_size, block_dim, callback); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 529d60a2820ea92de0b0009b31c9f2ad04d4860a..0e3f895d276af6856c64ddd123606b087689ca9a 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -166,8 +166,3 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType); -REGISTER_OP_CPU_KERNEL( - sgd, ops::SGDOpKernel, - ops::SGDOpKernel, - ops::SGDOpKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 3149f5f56ed4964a750f61a354c6cd31a29fc526..222244a2fd1e34ace573ad4fa06775c0e5113925 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -166,10 +166,3 @@ class SGDOpKernel }; } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - sgd, ops::SGDOpKernel, - ops::SGDOpKernel, - ops::SGDOpKernel); diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index f2cb427a0a5b139e1ccdf960afeb6db4bcb8b5a5..d0b78b9b0643d6c5dc5b4bfeac2cf792ac349194 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { return static_cast(abs(static_cast(x))); } + +__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) { + return static_cast(abs(static_cast(x))); +} + __device__ __forceinline__ float inline_abs(float x) { return abs(x); } 
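// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch only, not part of this patch.]
// The inline_abs / inline_pow overloads above and below exist so that a single
// templated p_norm kernel can be instantiated for float16, bfloat16, float and
// double. A hypothetical kernel using that overload set would look like:
//
//   template <typename T>
//   __global__ void SumAbsPow(const T* x, T* partial, int n, T porder) {
//     // Each thread accumulates |x[i]|^porder; inline_abs and inline_pow
//     // resolve to the matching overload for T at compile time.
//     T acc = static_cast<T>(0);
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//          i += gridDim.x * blockDim.x) {
//       acc += inline_pow(inline_abs(x[i]), porder);
//     }
//     // (block-level reduction of `acc` omitted for brevity)
//   }
//
// Supporting bfloat16 therefore only requires these device overloads plus the
// extra REGISTER_OP_CUDA_KERNEL entries added later in this file's diff.
// ---------------------------------------------------------------------------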
__device__ __forceinline__ double inline_abs(double x) { return abs(x); } @@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow( return static_cast( pow(static_cast(base), static_cast(exponent))); } +__device__ __forceinline__ platform::bfloat16 inline_pow( + platform::bfloat16 base, platform::bfloat16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel, ops::PnormCUDAKernel); REGISTER_OP_CUDA_KERNEL( p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 5df167fdf726345074cdc40afd0c5b394467578f..0aedd800e1a237d4baf0092eef9bac9f7dbe862d 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/padding.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); } - math::PaddingFunctor(rank, context, pads, pad_value, - *in_y, out); + phi::funcs::PaddingFunctor( + rank, context.template device_context(), pads, pad_value, + *in_y, out); } }; @@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); } - math::PaddingGradFunctor(rank, context, pads, *in_dout, - d_y); + phi::funcs::PaddingGradFunctor( + rank, context.template device_context(), pads, *in_dout, + d_y); } }; diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 39acba7e58aba51942d7d8de2d89e2783fd591f9..dc162ae5782f2690fcf6378603268369e4aeb9ca 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pad_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad"); - - auto x_dim = ctx->GetInputDim("X"); - auto& paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), x_dim.size() * 2, - platform::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], 0, - platform::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } } }; @@ -160,47 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor, + PD_INFER_META(phi::PadInferMeta)); REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker, - ops::PadOpGradMaker); + ops::PadOpGradMaker, + PadInferShapeFunctor); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, ops::PadOpDoubleGradMaker, ops::PadOpDoubleGradMaker); -REGISTER_OP_CPU_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CPU_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h deleted file mode 100644 index d494c954e1ef73b585761acf7490a5e35beccac4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PadKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - int rank = x->dims().size(); - math::PaddingFunctor(rank, context, pads, - static_cast(pad_value), *x, out); - } -}; - -template -class PadGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x == nullptr) { - return; - } - - d_x->mutable_data(context.GetPlace()); - int rank = d_out->dims().size(); - math::PaddingGradFunctor(rank, context, pads, *d_out, - d_x); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 2a127d9ad1db0c1e169fdd1e20a1568b99d228a0..21ca26f49f653d03e2710937d360091e0c4536df 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, - PT_INFER_META(phi::PixelShuffleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleInferMeta)); REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index 0cecbf0b9cb027f7032b7b20fb10ef06a79503df..d5896c4105932ef7327d7093a15cf50e87308ae5 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0c24935b47509dbe473a963240f4234e168a293..d061f9ae05613491cbdbff3793b57a3d89d7d6e5 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -81,8 +81,12 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); } else { for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if ((!ctx->IsRuntime()) && (in_x_dims[i + 
2] < 0)) { + output_shape.push_back(in_x_dims[i + 2]); + } else { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } } ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index da637dfeb237dd4f17816e784882720dc2f2ff64..cfacffff234105ac9c6dc41b86f06594d319dcbb 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - int output_channels = ctx->Attrs().Get("output_channels"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - 
platform::errors::InvalidArgument( - "The pooled output channels must greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker, - ops::PSROIPoolGradMaker); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel, - ops::CPUPSROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel, - ops::CPUPSROIPoolGradOpKernel); + ops::PSROIPoolGradMaker, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8b5afebf4b7951b0f04de69758b49d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391b0e648898c1b83858a7bd9809aa03..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. 
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae8890cdfaccb1244886daa63ae42..54e31845ad4bd5ddfa81bc90a10391f027dffc11 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c60a00f5ea3a8d1b853c6e5ba1fb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if 
(index_type == framework::proto::VarType::INT64) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9e35dd6e84403b88dbc0fdfa07b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
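The deleted put_along_axis CUDA kernels above dispatch twice: first on the Reduce attribute ("add", "multiply"/"mul", "assign"), then on the index dtype (INT32 vs INT64); the backward pass reuses gather because, as the removed comment notes, the gradient of scatter is gather. A hedged sketch of that dispatch shape in plain C++; the Scatter* stand-ins below are hypothetical placeholders for the gpu_scatter_*_kernel calls, not Paddle functions:

    #include <cstdint>
    #include <stdexcept>
    #include <string>

    // Hypothetical stand-ins for the gpu_scatter_add/mul/assign kernels.
    template <typename T, typename IndexT> void ScatterAdd() {}
    template <typename T, typename IndexT> void ScatterMul() {}
    template <typename T, typename IndexT> void ScatterAssign() {}

    // Dispatch shape used by the removed kernels: reduce mode first, then
    // index dtype (int32_t vs int64_t).
    template <typename T>
    void PutAlongAxisDispatch(const std::string& reduce_op, bool index_is_int32) {
      if (reduce_op == "add") {
        if (index_is_int32) ScatterAdd<T, int32_t>(); else ScatterAdd<T, int64_t>();
      } else if (reduce_op == "multiply" || reduce_op == "mul") {
        if (index_is_int32) ScatterMul<T, int32_t>(); else ScatterMul<T, int64_t>();
      } else if (reduce_op == "assign") {
        if (index_is_int32) ScatterAssign<T, int32_t>(); else ScatterAssign<T, int64_t>();
      } else {
        throw std::invalid_argument("unsupported reduce_op: " + reduce_op);
      }
    }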
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 5e841a097fed76d0ee5582c40ce417e24fb4a739..a57a8d5cf8b7f65a892ce9465ce03bd3c9519f1c 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,13 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - r.mutable_data>( + r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr Tensor qr; - qr.mutable_data>( + qr.mutable_data>( context.GetPlace(), - size_t(batch_size * m * n * sizeof(phi::funcs::Real))); + size_t(batch_size * m * n * sizeof(phi::dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -126,7 +126,7 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::funcs::Real), + qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } BatchedOrgqr( diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index cef9371fea099627fd4280f166f013bc84507372..f09a07e96cd34e1b631ef9484fe23b12a3b58543 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -74,19 +74,19 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - auto* r_data = r.mutable_data>( + auto* r_data = r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); 
- dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); phi::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -224,7 +224,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b18b7cecae9332c522d67aee98d63..c7e91ba35dee1356ddd71ade0fe9892f8032c77b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a35acf676e97a9134c2c43a73126c..4b6759ea165edf29add66ee44461fdd4d9f84d00 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df8815aaab8e55e29898700bb74d953..f0f3b6b7f9fdfeb69c46e7122fae5c6cfbf3a169 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1f3691978b577e2023eb4f784f2327752855b9b7..18e444702fbb2cc19912a32587f96330e6e8632d 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index 
e8e4ff7010d3df01cda514d51796b789ef5e1da6..a724524716be39e554c6046ca809624b7fbb053a 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) { } if (is_valid) { - phi::kernels::details::CheckReduceRank(reduce_rank, rank); + phi::funcs::details::CheckReduceRank(reduce_rank, rank); } else { - ASSERT_THROW(phi::kernels::details::CheckReduceRank(reduce_rank, rank), + ASSERT_THROW(phi::funcs::details::CheckReduceRank(reduce_rank, rank), paddle::platform::EnforceNotMet); } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a8057267015c8b3c15dd8468fca5a4b44..41df8e4a15f093a40a31c70eea98dfb7e575f4cd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu deleted file mode 100644 index 8194805ddc3736b365667883447cc13d7b729494..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
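In the reduce_max change above, the fluid CPU kernel registrations are dropped and shape inference is bound through DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ..., PD_INFER_META(phi::ReduceInferMetaBase)), so the shape rule is written once in phi and each operator only registers a binding. A toy standalone analogue of that idea (not the Paddle API; the registry, function names, and the simplified keep_dim rule are invented for illustration):

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // One shared shape rule, registered for several reduce ops, instead of a
    // hand-written InferShape per operator.
    using InferShapeFn = std::function<std::vector<int64_t>(
        const std::vector<int64_t>&, const std::vector<int64_t>&, bool)>;

    std::vector<int64_t> ReduceInferShape(const std::vector<int64_t>& x_dims,
                                          const std::vector<int64_t>& axes,
                                          bool keep_dim) {
      std::vector<int64_t> out;
      for (int64_t i = 0; i < static_cast<int64_t>(x_dims.size()); ++i) {
        const bool reduced = std::find(axes.begin(), axes.end(), i) != axes.end();
        if (!reduced) {
          out.push_back(x_dims[i]);
        } else if (keep_dim) {
          out.push_back(1);
        }
      }
      return out;
    }

    int main() {
      std::map<std::string, InferShapeFn> registry;
      // reduce_max, reduce_mean and reduce_sum can all bind the same rule.
      registry["reduce_max"] = ReduceInferShape;
      registry["reduce_mean"] = ReduceInferShape;
      const auto out = registry["reduce_max"]({4, 8, 16}, {1}, /*keep_dim=*/true);
      for (auto d : out) std::cout << d << " ";  // prints: 4 1 16
      std::cout << "\n";
      return 0;
    }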
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_max -REGISTER_OP_CUDA_KERNEL( - reduce_max, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index 7e02f0268b5e510ac8262543db58ee98ef20e517..1abec24c0d3ef9dc42739b90f775566a8737b852 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMaxMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index e80df5f95bb4ab33a6c08cc646d0ef8311e38936..4a18330913803f822436118a35fb957b7e31b391 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -18,6 +18,10 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -92,9 +96,13 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, - ops::ReduceMeanOpGradMaker); + ops::ReduceMeanOpGradMaker, + ReduceMeanInferShapeFunctor); REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc index daf5965fd54628a097ad1d53057ec54b9a5d329a..d80cce742210f1fb7ca6cda977e9f5b455f1a84b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMinMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 3aab906804f7adb95f80aa2675f01217b0b48d39..160617695338a9f2e140b7b418c93ef0d7c57e17 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -23,8 +23,7 @@ #include "paddle/fluid/framework/tensor.h" 
#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { namespace operators { @@ -37,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::kernels::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 50df75d9ad3fd78ece196e5b7cc76eafe42e1d2d..eb745ab9c56c5b3cfa62eb36713ebc2485282d6d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -27,15 +27,7 @@ class CPUDeviceContext; } // namespace paddle REGISTER_REDUCE_OP(reduce_prod); -REGISTER_OP_CPU_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + REGISTER_OP_CPU_KERNEL(reduce_prod_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h index 103e108e4bda1c33434ec0c5d6c58f24fa725f57..60dedf8d6ffb0706f8ec9ac2130b6b51067df918 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h @@ -19,13 +19,6 @@ namespace paddle { namespace operators { -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - struct ProdGradFunctor { template diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index bdab14a18a05ab3e0df1dbda57f3753033cfacb4..2a78774f3706e73bd8931e80fe020faac58d7ff5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -16,6 +16,10 @@ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -98,24 +102,15 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PD_INFER_META(phi::SumRawInferMeta)); + REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, ops::ReduceSumOpGradMaker, - ops::ReduceSumOpGradMaker); + ops::ReduceSumOpGradMaker, + ReduceSumInferShapeFunctor); REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); - -template -using CPUReduceSumGradKernel = - ops::ReduceSumGradKernel; - -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel>, - CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu deleted file mode 100644 index c3d3e0cf6ecd51f3bb2baa063878f80444db3563..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -template -using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel>, - CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index c18570af775cc88d7c54de7899d7359f791b8b08..a473b54c1f855945a5f3f0ac8d0826b15494ba1a 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,17 +16,17 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ @@ -100,7 +108,7 @@ struct Cell { }; template class EigenActivationFunctor, - math::detail::ActivationType act_type> + phi::funcs::detail::ActivationType act_type> struct SimpleRNNCell : Cell { void operator()(const platform::CPUDeviceContext* device_ctx, Tensor* input, const Tensor* weight_hh, const Tensor* init_h, @@ -148,7 +156,7 @@ struct GRUCell : Cell { size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = weight_hh->data(); gru_value.state_weight = weight_hh->data() + 2 * frame_size * frame_size; gru_value.reset_bias = bias_hh->data() + 2 * frame_size; @@ -158,10 +166,10 @@ struct GRUCell : Cell { gru_value.output_value = output->data(); gru_value.prev_out_value = init_h->data(); - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cand_act = 
math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); - math::GRUUnitFunctorV2::compute( + phi::funcs::GRUUnitFunctorV2::compute( *device_ctx, gru_value, frame_size, batch_size, cand_act, gate_act); } }; @@ -184,14 +192,14 @@ struct LSTMCell : Cell { blas.MatMul(*init_h, mat_dim_a, *weight_hh, mat_dim_b, static_cast(1.0), input, static_cast(1.0)); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; lstm_value.check_ig = nullptr; lstm_value.check_fg = nullptr; lstm_value.check_og = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cell_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cell_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; @@ -208,7 +216,7 @@ struct LSTMCell : Cell { lstm_value.state_value = last_c->data(); lstm_value.state_active_value = last_c_act->data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( *device_ctx, lstm_value, frame_size, batch_size, cell_clip, gate_act, cell_act, cand_act, false); } @@ -986,18 +994,18 @@ class RNNCPUKernel : public framework::OpKernel { seed, reserve_data); } else if (is_rnn_relu(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, seed, reserve_data); } else if (is_rnn_tanh(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, @@ -1014,14 +1022,14 @@ class RNNCPUKernel : public framework::OpKernel { }; template -void create_lstm_value(math::LstmMetaValue* lstm_value) { +void create_lstm_value(phi::funcs::LstmMetaValue* lstm_value) { lstm_value->check_ig = nullptr; lstm_value->check_fg = nullptr; lstm_value->check_og = nullptr; } template -void create_lstm_grad(math::LstmMetaGrad* lstm_grad) { +void create_lstm_grad(phi::funcs::LstmMetaGrad* lstm_grad) { lstm_grad->check_ig_grad = nullptr; lstm_grad->check_fg_grad = nullptr; lstm_grad->check_og_grad = nullptr; @@ -1686,8 +1694,8 @@ struct GRUGradCell : GradCell { // zero pre_hidden phi::funcs::SetConstant zero; zero(device_ctx, grad_pre_hidden, static_cast(0.0)); - math::GRUMetaValue gru_value; - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaValue gru_value; + phi::funcs::GRUMetaGrad gru_grad; gru_value.gate_value = gate_tensor->data(); gru_value.prev_out_value = pre_hidden->data(); gru_value.reset_output_value = state_tensor->data(); @@ -1703,9 +1711,9 @@ struct GRUGradCell : GradCell { grad_weight_hh->data() + 2 * frame_size * frame_size; gru_grad.bias_hh_grad = grad_bias_hh->data(); - auto act_gate = math::detail::GetActivationType("sigmoid_v2"); - auto act_node = math::detail::GetActivationType("tanh_v2"); - 
math::GRUUnitGradFunctorV2::compute( + auto act_gate = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto act_node = phi::funcs::detail::GetActivationType("tanh_v2"); + phi::funcs::GRUUnitGradFunctorV2::compute( device_ctx, gru_value, gru_grad, frame_size, batch_size, act_node, act_gate); @@ -1738,8 +1746,8 @@ struct LSTMGradCell : GradCell { backup_tensor(context, &grad_pre_state_bak, grad_pre_state); } - math::LstmMetaValue lstm_value; - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaValue lstm_value; + phi::funcs::LstmMetaGrad lstm_grad; create_lstm_value(&lstm_value); create_lstm_grad(&lstm_grad); lstm_value.gate_value = gate_tensor->data(); @@ -1755,12 +1763,12 @@ struct LSTMGradCell : GradCell { lstm_value.output_value = nullptr; lstm_grad.state_active_grad = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto state_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto state_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, batch_size, cell_clip, gate_act, state_act, cand_act, false); this->update_pre_hidden_grad( diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 6da73c99068bc0e0453dfdd1b5eca8e1add1954b..7fe6623dcca14afc8fafc4875ccfb7546e4456f0 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -38,7 +38,8 @@ class SaveCombineOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - return expected_kernel_type; + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place()); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index e4410b21b541320c1d39c3ad155dfce6f74b7dc2..cbf2b9152079e13acd4a221ece402b946b844999 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -121,8 +121,8 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index bb02bb541e14f551bb749c890877e4753d225c3c..0ae0e1500c16627fc269b31c57b25c47055d7d34 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
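The rnn_op.h hunks above swap operators::math for phi::funcs, but the cell structure is unchanged: the GRU/LSTM cells fill a MetaValue struct of raw buffer pointers and resolve activations by name ("sigmoid_v2", "tanh_v2") before calling a compute functor. A simplified standalone analogue of that wiring; CellBuffers and GetActivation are invented names standing in for the Paddle types, not the real API:

    #include <cmath>
    #include <functional>
    #include <map>
    #include <stdexcept>
    #include <string>

    // Invented analogue of GRUMetaValue / LstmMetaValue: the cell collects raw
    // buffer pointers and hands them to a compute routine.
    struct CellBuffers {
      float* gate_value = nullptr;
      float* state_value = nullptr;
      float* output_value = nullptr;
      const float* prev_out_value = nullptr;
    };

    // Name-based activation lookup, mirroring the
    // GetActivationType("sigmoid_v2" / "tanh_v2") calls in the hunks above.
    std::function<float(float)> GetActivation(const std::string& name) {
      static const std::map<std::string, std::function<float(float)>> table = {
          {"sigmoid_v2", [](float x) { return 1.0f / (1.0f + std::exp(-x)); }},
          {"tanh_v2", [](float x) { return std::tanh(x); }},
      };
      const auto it = table.find(name);
      if (it == table.end()) {
        throw std::invalid_argument("unknown activation: " + name);
      }
      return it->second;
    }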
*/ -#include "paddle/fluid/operators/scatter_nd_add_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -24,73 +27,6 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterNdAddOp should not be null.")); - - auto ref_dims = ctx->GetInputDim("X"); - auto ref_dims_size = ref_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - auto updates_dims = ctx->GetInputDim("Updates"); - auto updates_dims_size = updates_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], ref_dims_size, - platform::errors::InvalidArgument( - "The last dimension of Input(Index)'s shape should be no greater " - "than the rank of Input(X), but received the last dimension of " - "Input(Index)'s shape is %d, the rank of Input(X) is %d.", - index_dims[index_dims_size - 1], ref_dims_size)); - PADDLE_ENFORCE_GE(index_dims_size, 2UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1, " - "but received the rank of Input(Index) is %d.", - index_dims_size)); - - // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] - std::vector r_updates_dims; - for (int64_t i = 0; i < index_dims_size - 1; ++i) { - r_updates_dims.emplace_back(index_dims[i]); - } - for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) { - r_updates_dims.emplace_back(ref_dims[i]); - } - - PADDLE_ENFORCE_EQ( - r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument( - "Updates has wrong shape. The shape of Updates and Input(Updates) " - "should be same, but received the shape of Updates is %d, " - "the shape of Input(Updates) is %d.", - r_updates_dims.size(), updates_dims_size)); - - for (int64_t i = 0; i < updates_dims_size; ++i) { - PADDLE_ENFORCE_EQ( - r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument( - "Updates has wrong shape. 
The dimensions of Updates and " - "Input(Updates) should match, but received Updates's" - "%d-th dimension is %d, Input(Updates)'s %d-th " - "dimension is %d.", - i, r_updates_dims[i], i, updates_dims[i])); - } - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,7 +35,8 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "Ref and Updates must have same type")); return framework::OpKernelType( - framework::TransToProtoVarType(ctx.Input("X")->type()), + framework::TransToProtoVarType( + ctx.Input("X")->type()), ctx.device_context()); } }; @@ -108,17 +45,6 @@ class ScatterNdAddGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +119,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, + ScatterNdAddGradInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddGradInferMeta)); + REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, - ops::ScatterNdAddGradMaker); + ops::ScatterNdAddGradMaker, + ScatterNdAddInferShapeFunctor); REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp, - ops::ScatterNdAddGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel); + ops::ScatterNdAddGradNoNeedBufferVarsInferer, + ScatterNdAddGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu deleted file mode 100644 index 6448f8cc4056d2c11806c1c342df57d597e606ba..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
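The deleted ScatterNdAddOp::InferShape above enforced the rule update.shape = index.shape[:-1] + x.shape[index.shape[-1]:], which now lives in phi::ScatterNdAddInferMeta behind the new shape functor. A small standalone helper that computes the expected Updates shape under that rule, for illustration only (the function name is not from the patch):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // updates.shape = index.shape[:-1] + x.shape[index.shape[-1]:]
    std::vector<int64_t> ExpectedUpdatesShape(const std::vector<int64_t>& x_dims,
                                              const std::vector<int64_t>& index_dims) {
      assert(index_dims.size() >= 2);
      const int64_t last = index_dims.back();
      assert(last <= static_cast<int64_t>(x_dims.size()));

      // Leading dims come from the index, trailing dims from the reference tensor.
      std::vector<int64_t> updates_dims(index_dims.begin(), index_dims.end() - 1);
      for (size_t i = static_cast<size_t>(last); i < x_dims.size(); ++i) {
        updates_dims.push_back(x_dims[i]);
      }
      return updates_dims;
    }

    // Example: x_dims = {6, 7, 8}, index_dims = {4, 2} -> updates must be {4, 8}.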
*/ - -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" -#include "paddle/fluid/operators/scatter_nd_add_op.h" - -namespace paddle { -namespace operators { - -template -class ScatterNdAddOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); - } else { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); - } else { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h deleted file mode 100644 index 2bdf9ec58a850ea59f7f0697bc5d0eadde0adc99..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterNdAddOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - // In place output: Out = X - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *Updates, *Ids, Out); - } else { - ScatterNdAdd(ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); - } else { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3174f07e96e227c8a2f1103d3d6664673c7a2d56..5f6b04cf59e0e3c8c05d44ad6c4a3321ff2516e4 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/scatter_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,46 +26,6 @@ class ScatterOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true, - platform::errors::InvalidArgument( - "Input(Ids) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterOp should not be null.")); - - auto updates_dims = ctx->GetInputDim("Updates"); - auto ref_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument( - "The size of Input(Ids)'s shape should be equal to 1, but " - "received the rank of Input(Ids) is %d.", - ctx->GetInputDim("Ids").size())); - PADDLE_ENFORCE_EQ( - ref_dims.size(), updates_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Updates) should have the same shape size, " - "but received the size of Input(x)'s shape is %d, the size of " - "Input(Updates)'s shape is %d.", - ref_dims.size(), updates_dims.size())); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Input(Updates) and Input(Ids) should have same batch-size, but" - " received Input(Updates)'s batch-size is %d, Input(Ids)'s " - "batch-size is %d.", - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -76,17 +39,6 @@ class ScatterGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -151,17 +103,17 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + PD_INFER_META(phi::ScatterInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PD_INFER_META(phi::ScatterGradInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, ops::ScatterGradMaker, ops::ScatterGradMaker, - ops::ScatterInplaceInferer); + ops::ScatterInplaceInferer, ScatterInferShapeFunctor); REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, - 
ops::ScatterGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel, - ops::ScatterOpKernel, ops::ScatterOpKernel, - ops::ScatterOpKernel); -REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel); + ops::ScatterGradNoNeedBufferVarsInferer, + ScatterGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu deleted file mode 100644 index 549e30803b4647e3e107b0d16147c472c0dcb226..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op.cu +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" -#include "paddle/fluid/operators/scatter_op.h" - -namespace paddle { -namespace operators { - -template -class ScatterOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - bool overwrite = ctx.Attr("overwrite"); - - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // use template class to support int32_t and int64_t - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); - } else { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); - } - } -}; - -template -class ScatterGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - 
index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); - } else { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } else { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - scatter_grad, ops::ScatterGradOpCUDAKernel, - ops::ScatterGradOpCUDAKernel, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h deleted file mode 100644 index 69ab6c7135cd55468bbe8a4c65d45a466b8eaa75..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
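// The CUDA scatter gradient deleted above (and the CPU version in scatter_op.h just
// below) implements one simple rule: d(Updates) is a gather of d(Out) at Ids, and d(X)
// is d(Out) with the scattered rows zeroed. A self-contained sketch of that rule on
// plain row-major arrays; the sizes and values are illustrative assumptions, not code
// from this diff:
#include <cstdio>
#include <vector>

int main() {
  const int cols = 2;
  // Gradient of Out, shape [4, 2], stored row-major.
  std::vector<float> d_out = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int> ids = {0, 2};  // rows written by the forward scatter

  // d(Updates) = d(Out)[ids]  -- "gradient by gather", as in the kernels above.
  std::vector<float> d_updates;
  for (int r : ids)
    for (int c = 0; c < cols; ++c) d_updates.push_back(d_out[r * cols + c]);

  // d(X) = d(Out) with the scattered rows zeroed -- what ScatterGradForX does.
  std::vector<float> d_x = d_out;
  for (int r : ids)
    for (int c = 0; c < cols; ++c) d_x[r * cols + c] = 0.f;

  for (float v : d_updates) std::printf("%g ", v);  // 1 2 5 6
  std::printf("\n");
  for (float v : d_x) std::printf("%g ", v);        // 0 0 3 4 0 0 7 8
  std::printf("\n");
}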
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - double overwrite = ctx.Attr("overwrite"); - - // In place output: Out = X, Out[Ids] = Updates - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); - } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); - } - } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); - } else { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); - } - } - } -}; - -template -class ScatterGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); - } else { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } else { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // 
namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index fa5f03a092882ec1f63e9556bc38d94ed40c9a7f..815984ac307fdce14a64f01a661b4b7f7ce1d616 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/operators/scatter_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc index 9f0b74e8a3f80c5c8a22c2db109f75e6ee316be1..07dd2f2d85fe9ac330be1f85d283c85207b1b78c 100644 --- a/paddle/fluid/operators/scatter_op_xpu.cc +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 0a4cab5fac1abe92b2b2457098d71a7dc3624910..93f2d60e5f232767f8e604ca98e3c39fc00caf8b 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" #include @@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::ScatterAssign(ctx, src, index, &output); + phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data()[i], 0.0f); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3ad97ba74f049696fdec592ee524e..9d4c8532a82c064b1b7aef759934ad8dad894ec5 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index 4e20844dc3275f840ff93029abb222e2ef02e0fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f988884a25feba4665283d3ce260988..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. 
Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if (pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - 
phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 0adf61d7ce3e5b5792b9dc65d5ac8f884dc81ea5..59c6e16535738ba6cbb3224dd4ff5c2987618cdf 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/selu_op.h" - #include #include #include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -30,10 +31,6 @@ class SeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - return UnaryOpUnchangedInferShape(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -123,13 +120,12 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, - ops::SeluGradMaker); + ops::SeluGradMaker, + SeluInferShapeFunctor); + REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); -REGISTER_OP_CPU_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CPU_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.cu b/paddle/fluid/operators/selu_op.cu deleted file mode 100644 index fb3245ab7609ea9067709134a3713e9871dbb4d4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/selu_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
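// The selu_op.cc hunk above shows, in full, the pattern most hunks in this change
// follow: the hand-written InferShape override and the REGISTER_OP_CPU_KERNEL /
// REGISTER_OP_CUDA_KERNEL lists are deleted, and shape inference is delegated to a phi
// InferMeta function through DECLARE_INFER_SHAPE_FUNCTOR. Restated here with the
// angle-bracketed GradMaker arguments that the text extraction dropped; the
// framework::OpDesc / imperative::OpBase pair is the usual choice and is an assumption
// on my part, not text preserved in this diff:
DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor,
                            PD_INFER_META(phi::UnchangedInferMeta));

REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
                  ops::SeluGradMaker<paddle::framework::OpDesc>,
                  ops::SeluGradMaker<paddle::imperative::OpBase>,
                  SeluInferShapeFunctor);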
*/ -#include "paddle/fluid/operators/selu_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CUDA_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h deleted file mode 100644 index b2fc834c42f65ff3521b6267ed2f32fabbab4e4d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/selu_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct SeluFunctor { - SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) - : x_data_ptr_(x_data_ptr), - alpha_(alpha), - scale_(scale), - y_data_ptr_(y_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T x_ele = x_data_ptr_[idx]; - if (x_ele <= 0) { - x_ele = alpha_ * real_exp(x_ele) - alpha_; - } - y_data_ptr_[idx] = scale_ * x_ele; - } - const T* x_data_ptr_; - const float alpha_; - const float scale_; - T* y_data_ptr_; -}; - -template -struct SeluGradFunctor { - SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha, - float scale, T* dx_data_ptr) - : y_data_ptr_(y_data_ptr), - dy_data_ptr_(dy_data_ptr), - alpha_(alpha), - scale_(scale), - la_(alpha * scale), - dx_data_ptr_(dx_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T y_ele = y_data_ptr_[idx]; - T dy_ele = dy_data_ptr_[idx]; - - float tmp = scale_; - if (y_ele <= 0) { - tmp = y_ele + la_; - } - dx_data_ptr_[idx] = dy_ele * tmp; - } - const T* y_data_ptr_; - const T* dy_data_ptr_; - const float alpha_; - const float scale_; - const float la_; - T* dx_data_ptr_; -}; - -template -class SeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto out_ptr = out->mutable_data(context.GetPlace()); - - SeluFunctor functor(x->data(), alpha, scale, out_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(x->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -template -class SeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* out = context.Input("Out"); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto dx_ptr = dx->mutable_data(context.GetPlace()); - 
- SeluGradFunctor functor(out->data(), dout->data(), alpha, scale, - dx_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(out->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044b26b598f835ee40462a01077c1ff8..23c6a0133e1edafba5621825db78a52b88e6947a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 2d4730635fd2aeb2e20aa5f4a637f94bce075566..25c12ab565a141f48d254d51bfca64f7422f1f42 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 365381abc4683580b9dffb94ace9876933de495b..2960b77d5ac0f81e4dd026d9de3448cac1459645 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -15,8 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71faf0b20950d87de1a7f066e2e49310a..7d0d782b837c4c828996e993634373ab38d88eac 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,13 +241,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index f9701b0acaac769bd91bbba156a010c2e05e42c3..9f291a863c067ae0210f44befb89191678291441 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -16,13 +16,6 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OP_CUDA_KERNEL( set_value_grad, ops::SetValueGradKernel, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 9dd727959202c6b09bad0f07aa242a8897583342..4d459f8c01b159549c331f9332e49ed79e7c9b16 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -121,201 +121,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } - -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegent code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future. 
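// The switch below exists because Eigen's fixed-rank tensor types need the rank as a
// compile-time constant, while the op only learns it at run time. A minimal,
// self-contained sketch of the same dispatch idiom (illustrative only, not Paddle code):
#include <cstdio>
#include <stdexcept>

template <int D>
void WorkOnRank() { std::printf("instantiated for rank %d\n", D); }

void Dispatch(int rank) {
  switch (rank) {
    case 1: WorkOnRank<1>(); break;
    case 2: WorkOnRank<2>(); break;
    case 3: WorkOnRank<3>(); break;
    case 4: WorkOnRank<4>(); break;
    case 5: WorkOnRank<5>(); break;
    case 6: WorkOnRank<6>(); break;
    default: throw std::invalid_argument("rank must be in [1, 6]");
  }
}

int main() { Dispatch(3); }  // prints: instantiated for rank 3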
- switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. 
- paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
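// Worked through with the note's own numbers (illustration only):
//   x.shape          = [3, 4]
//   assignment       : x[:, 0] = value_tensor, value_tensor.shape = [3]
//   axes/starts/ends = {1}/{0}/{1}, so slice_dims  = [3, 1]
//   axis 1 is decreased, so decrease_slice_dims    = [3]
//   broadcast([3, 1], [3]) -> [3, 3]   // wrong: writes outside the slice
//   broadcast([3],    [3]) -> [3]      // right: hence the Resize below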
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - template class SetValueGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dcfa54fa728a8ebf88ad95f387774..46d64333b608b7f3e7b3d83664978d162b6d6e52 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097a2cfd74c3d65c0679d277b766a3..e2c8359beb1290f7b1b592c1ff24b15986f41f73 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -95,9 +93,3 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84db7de53d0c218682813fcad0128d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710e0b817792105046af70b6298fc1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d668347692309d3695eb46b1fbdb6c7dd..f751ab41014c21fda2403bd69bcd20ad549e40c7 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a643253843ed09ab7475ec3ed723d5e3b8..a62d1b434e76434c3710e45e723060d3f452c91c 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
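// In the shape_op_xpu.cc hunk just below, the template parameters and registration
// types were stripped by the text extraction. A plausible reading of the added kernel,
// offered as a reconstruction only -- the element types (int32_t host output, and the
// types behind the five REGISTER_OP_XPU_KERNEL entries) are assumptions, not text
// preserved in this diff:
template <typename T>
class ShapeXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_var = ctx.InputVar("Input");
    framework::DDim in_dims;
    if (in_var->IsType<phi::SelectedRows>()) {
      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
    } else {
      in_dims = in_var->Get<framework::LoDTensor>().dims();
    }
    auto* out_t = ctx.Output<framework::Tensor>("Out");
    out_t->Resize({in_dims.size()});
    // The shape itself is materialized on the host as a 1-D int32 tensor.
    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
    for (int i = 0; i < in_dims.size(); ++i) {
      out_data[i] = in_dims[i];
    }
  }
};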
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index 54555e494ffe5f2c226c7aabd47b4ce991dab2ec..053a90f2fc9fa2f93c2647c420a046401198bc28 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,27 +23,6 @@ namespace operators { class ShardIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U, - platform::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -114,7 +96,10 @@ Examples: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp, - ops::ShardIndexOpMaker); -REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel, - ops::ShardIndexCPUKernel); +DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor, + PD_INFER_META(phi::ShardIndexInferMeta)); +REGISTER_OPERATOR( + shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ShardIndexInferShapeFunctor); diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu deleted 
file mode 100644 index 115b3f47d664ba00228343d221d5be70d13a7ff1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ShardIndexInner(const T* in_data, T* out_data, - const int64_t numel, const int index_num, - const int nshards, const int shard_id, - const int ignore_value) { - int shard_size = (index_num + nshards - 1) / nshards; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); - if (in_data[idx] / shard_size == shard_id) { - out_data[idx] = in_data[idx] % shard_size; - } else { - out_data[idx] = ignore_value; - } - } -} - -using LoDTensor = framework::LoDTensor; - -template -class ShardIndexCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, index_num, nshards, shard_id, ignore_value); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel, - ops::ShardIndexCUDAKernel); diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h 
deleted file mode 100644 index c2fe3711686d4c4c802fadd66d4bc994232ef5ec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -template -class ShardIndexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - for (int64_t i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE(in_data[i], 0, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be " - "greater or equal to 0, but the value given is %d.", - in_data[i])); - PADDLE_ENFORCE_LT(in_data[i], index_num, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be less " - "than index_num (%d), but the value given is %d.", - index_num, in_data[i])); - if (in_data[i] / shard_size == shard_id) { - out_data[i] = in_data[i] % shard_size; - } else { - out_data[i] = ignore_value; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index dc2e8ad58f31ce8fe845ecb1f368544704e1d9ad..c875448424a24e686b9a6285725f801d604abc46 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
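// The shard_index kernels deleted above (CUDA and CPU alike) reduce to the same few
// lines of integer arithmetic. A standalone sketch with made-up numbers; the attribute
// names match the op, the values are illustrative assumptions:
#include <cstdio>

int ShardIndex(int in, int index_num, int nshards, int shard_id, int ignore_value) {
  const int shard_size = (index_num + nshards - 1) / nshards;  // ceiling division
  return (in / shard_size == shard_id) ? in % shard_size : ignore_value;
}

int main() {
  // index_num = 20, nshards = 2  ->  shard_size = 10
  std::printf("%d\n", ShardIndex(17, 20, 2, 1, -1));  // 7   (index 17 lives on shard 1)
  std::printf("%d\n", ShardIndex(17, 20, 2, 0, -1));  // -1  (ignored on shard 0)
}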
-#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index a4e80343903d5a48dda584dc1f203782adb36787..016ff54645b02e9b3ddfb67595d830ccf5dcfd94 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -12,59 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. 
But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -200,23 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel, - ops::SigmoidCrossEntropyWithLogitsKernel); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu deleted file mode 100644 index 40476d5e11f6a3b0cad21038a3f342d824f3575c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else -static constexpr int kNumCUDAThreads = 512; -#endif -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, - const int ignore_index, const int limit, - T *out_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); - if ((diff > -eps) && (diff < eps)) { - out_data[i] = static_cast(0.); - counts[i] = 0; - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - out_data[i] = term1 - term2 + term3; - counts[i] = 1; - } - } -} - -template -__global__ void Sum(const T *counts, int num, const T eps, T *sum) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T in = 0; - for (int i = threadIdx.x; i < num; i += BlockDim) { - in += counts[i]; - } - __syncthreads(); - auto out = - BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - T a = out > eps ? 
out : eps; - sum[0] = a; - } -} - -template -__global__ void Div(T *loss, const int num, const T *norm) { - CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } -} - -template -__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, - const int ignore_index, const T *dout_data, - const int limit, T *dx_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T dout = dout_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); - if ((diff > -eps) && (diff < eps)) { - dx_data[i] = static_cast(0.); - counts[i] = 0; - } else { - T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); - T diff = simoid_x - label; - dx_data[i] = dout * diff; - counts[i] = 1; - } - } -} - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - bool normalize = context.Attr("normalize"); - - // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidForward<<>>( - X->data(), Labels->data(), ignore_index, limit, out_data, counts); - if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(out_data, limit, norm); - } - } -}; - -// dX = sigmoid(X) - labels -template -class GPUSigmoidCrossEntropyWithLogitsGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - - auto &dev_ctx = context.cuda_device_context(); - // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidBackward<<>>( - X->data(), Labels->data(), ignore_index, dOut->data(), limit, - dx_data, counts); - bool normalize = context.Attr("normalize"); - if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(dx_data, limit, norm); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, double>); -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, 
- ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h deleted file mode 100644 index d2ced490ceff474e1e7624c591a9d142b4199c2f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -const int kIgnoreIndex = -100; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - if (static_cast(label) == ignore_index) { - out_data[idx] = static_cast(0.); - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); - out_data[idx] = term1 - term2 + term3; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? 
norm : eps; - std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -// dX = sigmoid(X) - labels -template -class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - T dout = dout_data[idx]; - if (static_cast(label) == ignore_index) { - dx_data[idx] = static_cast(0.); - } else { - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - T diff = simoid_x - label; - dx_data[idx] = dout * diff; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? norm : eps; - std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 40852425997f0b1a9cfa0c86180f2f2254efceec..f186f95a2b96117fa56fc17f70d4d0884214af87 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
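For reference, the arithmetic the deleted CPU and CUDA kernels above implemented (and that the phi sigmoid_cross_entropy_with_logits kernels now provide) is the numerically stable binary cross entropy with logits: Out = max(X, 0) - X * Label + log(1 + exp(-|X|)), with dX = (sigmoid(X) - Label) * dOut, elements whose label equals ignore_index zeroed out, and an optional division by the number of counted elements when normalize is set. A minimal standalone restatement of the forward pass (illustrative only, not the phi implementation; the eps clamp of the deleted code is simplified to a max with 1):

#include <cmath>
#include <cstddef>

template <typename T>
void SigmoidCrossEntropyWithLogitsRef(const T* x, const T* label, T* out,
                                      std::size_t n, int ignore_index,
                                      bool normalize) {
  std::size_t counted = 0;  // number of elements whose label != ignore_index
  for (std::size_t i = 0; i < n; ++i) {
    if (static_cast<int>(label[i]) == ignore_index) {
      out[i] = static_cast<T>(0);
      continue;
    }
    const T xi = x[i];
    // max(x, 0) - x * label + log(1 + exp(-|x|)) avoids overflow of exp(x).
    out[i] = (xi > 0 ? xi : static_cast<T>(0)) - xi * label[i] +
             std::log(static_cast<T>(1) + std::exp(-std::abs(xi)));
    ++counted;
  }
  if (normalize) {
    const T norm = static_cast<T>(counted > 0 ? counted : 1);
    for (std::size_t i = 0; i < n; ++i) out[i] /= norm;
  }
}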
*/ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 6395aa1caa01b9578d55e1155b0d6cd0d2295e36..c37731580d1212cb47c9e7f18aa4a9ba20af19d8 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -17,13 +17,15 @@ #include #include -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index e2381c76f7e45a962fcacff079ca67df9610b6f1..ceb42dcf3e592182867a890bdfe73e237913ee53 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index e584c1a4cce1e85344c574526098b034723c3059..84b0f403be03893810ef592db9b2c993cc6b9644 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -44,8 +44,8 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, - PT_INFER_META(phi::SizeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PD_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e18d0a986268ff4692129c6515edc..3148b31a8322e2bab39ad7f723ee59a6db64c204 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 1cd6f8b7698b949a8e198c766fcf193e13481298..34650c2e06245532eda5ebcf9e8d8454ee93237b 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -37,7 +37,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { "the mlu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); loss->mutable_data(ctx.GetPlace()); backprop->mutable_data(ctx.GetPlace()); @@ -45,10 +45,10 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] const int cnnl_softmax_dims = 3; - const int d1 = SizeToAxis(axis, logits->dims()); + const int d1 = phi::funcs::SizeToAxis(axis, logits->dims()); const int d2_logits = logits->dims()[axis]; const int d2_labels = labels->dims()[axis]; - const int d3 = SizeOutAxis(axis, logits->dims()); + const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims()); // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as // possible. diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 2bc5124843c38152d2f5d3ffcef5a5ca24534bfd..a60ec5a4df52b8275a17185a63c8a7d27dd8132b 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -23,9 +23,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" #endif @@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel { std::vector pads(rank * 2, 0); pads[axes.back() * 2 + 1] = zero_length; - paddle::operators::math::PaddingFunctor( - rank, ctx, pads, static_cast(0), *dy, &full_dy); + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); } diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a8f05d94563e57a20cc41ba1edd68872d869d00e..5b8922505cc089d66f0b444fc65ccec8ed051876 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -15,6 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/split_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { using framework::Tensor; diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609eb29326dc5cf295d978d767ac176..d61f5aa3f634cd2aee1e5c2f34f4467b1697e455 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c92d468f3462c92cd0631383996012afb6edb46b..af29aac6b9052877283271abc12f4dc1da6b8a3e 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); +#elif defined(PADDLE_WITH_MLU) + auto& mlu_place = place; + auto& mlu_ctx = reinterpret_cast(ctx); + memory::Copy(mlu_place, dst + i * dst_after, mlu_place, + src + i * src_after, sizeof(T) * size, mlu_ctx.stream()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Paddle is not compiled with GPU.")); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 3e2d2a5495b3428ce0fad9d61431d53b44eea330..33590c1d7cca04e215e55abb26fb2aa3c3b61bec 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bcb3ee44f04657f1afcb9e85dbc01fde71562c39..166f49999d552917021a545b2799ae33ff257f06 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -105,7 +105,7 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, + return platform::complex>(x.real * y.real, x.imag * y.real); } }; @@ -391,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -661,9 +661,9 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); diff 
--git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f5e451ac7054d15919170f06f4fd225a2243f5d1..42a847206a3cb6fecc421effa9e9d10bacc80be4 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(phi::funcs::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(phi::dtype::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(phi::funcs::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(phi::funcs::Real))); + size_t(batches * col_v * cols * sizeof(phi::dtype::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e4661769d9b2844c5388f0efa91c0..fa8a5e92712ec86a01ca01b7eb644e289c03000a 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379dda568f661b31366914e6870a7c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
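The deleted take_along_axis kernels that follow are built on a gather/scatter pair: the forward gathers input values along `Axis` at the positions given by `Index`, and, as the deleted comment puts it, "the gradient of gather is scatter", i.e. the backward scatter-adds the upstream gradient into a zero-initialized tensor of the input's shape. A minimal 1-D sketch of that duality (illustrative only, not the phi kernels):

#include <cstddef>
#include <cstdint>
#include <vector>

// Forward: result[i] = input[index[i]].
std::vector<float> TakeAlong1D(const std::vector<float>& input,
                               const std::vector<int64_t>& index) {
  std::vector<float> result(index.size());
  for (std::size_t i = 0; i < index.size(); ++i) result[i] = input[index[i]];
  return result;
}

// Backward: start from a zero tensor shaped like the input and scatter-add
// the result gradient back to the gathered positions.
std::vector<float> TakeAlong1DGrad(const std::vector<float>& result_grad,
                                   const std::vector<int64_t>& index,
                                   std::size_t input_size) {
  std::vector<float> input_grad(input_size, 0.0f);
  for (std::size_t i = 0; i < index.size(); ++i)
    input_grad[index[i]] += result_grad[i];
  return input_grad;
}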
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad25de3728e76d231d0164d46c08e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. 
- auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index e05b4de65214c8cf55d099fccc7c18370b2312b7..0a71875d8931ef80846aa7e0c95ce1beab86fd7c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -79,6 +79,28 @@ static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, model_input_shape_str, runtime_input_shape_str)); } +static paddle::experimental::DataType TRT2FluidDataType( + nvinfer1::DataType type) { + switch (type) { + case nvinfer1::DataType::kFLOAT: + return paddle::experimental::DataType::FLOAT32; + case nvinfer1::DataType::kINT32: + return paddle::experimental::DataType::INT32; + case nvinfer1::DataType::kHALF: + return paddle::experimental::DataType::FLOAT16; + case nvinfer1::DataType::kINT8: + return paddle::experimental::DataType::INT8; +#if IS_TRT_VERSION_GE(7000) + case nvinfer1::DataType::kBOOL: + return paddle::experimental::DataType::BOOL; +#endif + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "unknown fluid datatype in Fluid op converter")); + return paddle::experimental::DataType::FLOAT32; + } +} + static void RuntimeDynamicShapeCheck( const std::string &x, const std::vector &runtime_input_shape, const std::vector &min_input_shape, @@ -520,9 +542,12 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT32) { buffers[bind_index] = static_cast(t.data()); + } else if (type == framework::proto::VarType::FP16) { + buffers[bind_index] = static_cast(t.data()); } else { - PADDLE_THROW(platform::errors::Fatal( - "The TRT Engine OP only support float/int32_t/int64_t input.")); + PADDLE_THROW( + platform::errors::Fatal("The TRT Engine OP only support " + "float/int32_t/int64_t/float16 input.")); } } @@ -570,9 +595,10 @@ class TensorRTEngineOp : public framework::OperatorBase { "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", bind_index, num_bindings)); - buffers[bind_index] = - static_cast(fluid_t->mutable_data(dev_place)); - + auto trt_type = engine->engine()->getBindingDataType(bind_index); + // get adr and set type + buffers[bind_index] = static_cast( + fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type))); output_index += 1; } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f..1de1b590a1311b81f16ba05e746402e1fc14c556 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index dc12f8e8892a022c6f55f4fe3a6237a7a01fa290..e179149c5bb77bd642f744be48109a941c66febf 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile"); - auto x_dims = ctx->GetInputDim("X"); - auto repeat_times = ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times.size())); - PADDLE_ENFORCE_GE( - repeat_times.size(), 1, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times[i])); - out_shape[i] = x_dim_vec[i] * repeat_times[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor, + 
PD_INFER_META(phi::TileInferMeta)); + REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, - ops::TileGradOpMaker); + ops::TileGradOpMaker, + TileInferMetaFunctor); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, ops::TileDoubleGradOpMaker, ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CPU_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#endif diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h deleted file mode 100644 index 1698b5e3c6322e2cd9cbe7cf4839e2fc08627b32..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tile_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
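The tile shape rule that the deleted InferShape above encoded, and that phi::TileInferMeta now owns, promotes the shorter of x's shape and repeat_times by left-padding it and then multiplies element-wise (at runtime the deleted kernel pads with 1; the static InferShape marked padded entries as unknown, -1). A small worked restatement of the runtime rule (illustrative helper, not Paddle code):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Example: x_dims = {3, 4}, repeat_times = {2, 1, 5}
//   -> x promoted to {1, 3, 4} -> out = {2, 3, 20}.
std::vector<int64_t> TileOutShape(std::vector<int64_t> x_dims,
                                  std::vector<int64_t> repeat_times) {
  const std::size_t rank = std::max(x_dims.size(), repeat_times.size());
  x_dims.insert(x_dims.begin(), rank - x_dims.size(), 1);
  repeat_times.insert(repeat_times.begin(), rank - repeat_times.size(), 1);
  std::vector<int64_t> out(rank);
  for (std::size_t i = 0; i < rank; ++i) out[i] = x_dims[i] * repeat_times[i];
  return out;
}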
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { -inline std::vector get_repeat_times( - const framework::ExecutionContext& ctx) { - if (ctx.HasInput("RepeatTimes")) { - auto* repeat_tensor = ctx.Input("RepeatTimes"); - auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { - paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), - &cpu_repeat_tensor); - repeat_data = cpu_repeat_tensor.data(); - } - auto vec_repeat_times = - std::vector(repeat_data, repeat_data + repeat_tensor->numel()); - return vec_repeat_times; - } - - auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); - if (list_repeat_times_tensor.size() > 0) { - // get tensor from - std::vector vec_repeat_times; - for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { - auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_repeat_times.push_back(*temp.data()); - } else { - vec_repeat_times.push_back(*tensor->data()); - } - } - return vec_repeat_times; - } else { - return ctx.Attr>("repeat_times"); - } -} - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; -using framework::To32BitIndex; - -template -class TileKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times_size)); - rank = std::max(rank, repeat_times_size); - switch (rank) { - case 1: - Tile<1>(context); - break; - case 2: - Tile<2>(context); - break; - case 3: - Tile<3>(context); - break; - case 4: - Tile<4>(context); - break; - case 5: - Tile<5>(context); - break; - case 6: - Tile<6>(context); - break; - } - } - - protected: - template - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = 
in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } - } -}; - -template -class TileGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto repeat_times = get_repeat_times(context); - auto x_dims = x->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - // 1. reshape_dims_vec is the broadcast parameter. - // 2. reduce_dims_vec is the dimension parameter to compute gradients. For - // each dimension expanded, the gradients should be summed to original - // size. 
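// Illustrative worked example for the comment above (values are hypothetical):
// x.shape = [2, 3], repeat_times = [2, 1]  =>  Out.shape = [4, 3].
// The loop below then builds reshape_dims_vec = {2, 2, 1, 3} and
// reduce_dims_vec = {0, 2}: Out@GRAD ([4, 3]) is viewed as [2, 2, 1, 3] and
// summed over dims 0 and 2, which recovers X@GRAD with the original shape [2, 3].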
- std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - dx->mutable_data(context.GetPlace()); - framework::TensorCopy(*dout, context.GetPlace(), context.device_context(), - dx); - // TensorCopy may change the dims of dx - dx->Resize(x_dims); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "Th rank of the input 'Out@GRAD' for tile_grad op " - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for tile_grad op " - "must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void TileBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..95bfb9f4e1a9d374c66997567f5d80df8b5d8701 --- /dev/null +++ b/paddle/fluid/operators/tile_op_functor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +namespace paddle { +namespace operators { + +inline std::vector get_repeat_times( + const framework::ExecutionContext& ctx) { + if (ctx.HasInput("RepeatTimes")) { + auto* repeat_tensor = ctx.Input("RepeatTimes"); + auto* repeat_data = repeat_tensor->data(); + framework::Tensor cpu_repeat_tensor; + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_xpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); + repeat_data = cpu_repeat_tensor.data(); + } + auto vec_repeat_times = + std::vector(repeat_data, repeat_data + repeat_tensor->numel()); + return vec_repeat_times; + } + + auto list_repeat_times_tensor = + ctx.MultiInput("repeat_times_tensor"); + if (list_repeat_times_tensor.size() > 0) { + // get tensor from + std::vector vec_repeat_times; + for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { + auto tensor = list_repeat_times_tensor[i]; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_repeat_times.push_back(*temp.data()); + } else { + vec_repeat_times.push_back(*tensor->data()); + } + } + return vec_repeat_times; + } else { + return ctx.Attr>("repeat_times"); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 9e306c7be537bc7403812f4907541e1a9671c12a..cea6b458aec782923722cb37fe41c1c4d59292e5 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -11,7 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 6b60b167a2465fcb03d8ec088cfa288f9fb14af1..598377587d6f73e0c21abbc4d3819d16eacb1f23 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -11,11 +11,14 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class TileXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index d60976928e00cb5ecfde6ca65e0a1b0d5b1ef938..80c9935057cb5d5809fde545bdd0772afdaf2702 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -51,6 +51,19 @@ namespace operators { using Tensor = framework::Tensor; +inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + struct SegmentOffsetIter { EIGEN_DEVICE_FUNC explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 810afc901df57bfa3c518b2363fb9153ee353762..d1add111e1d24cb711955a9aff06eb19feb35dc9 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72c5dbd5404a889925541374c9823..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
a808207476f3b9be2636741d7b0ac06002ccba08..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - The reason why we need the topk v2 is because the compatibility. We redefine - the NaN is maximum value - in the process of comparing. If do not add the topk v2, will affect the - inference result of model that traing - by the older version paddlepaddle. -*/ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, - int* post) { - *pre = 1; - *post = 1; - *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; - } -} - -template -static void FullTopK(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - const int& k, const bool& largest, const bool& sorted) { - // when the k is small, will the partial sort - bool partial_sort_flag = (k * 64) < input_width; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - // Eigen::DSizes flat2dims(input_height, input_width); - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [&largest](const std::pair& l, const std::pair& r) { - if (largest) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - } else { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - } - }); - } else { - // use the nth-element to get the K-larger or K-small element - if (largest) { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort(col_vec.begin(), col_vec.begin() + k - 1, - [&largest](const std::pair& l, - const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - } - } else { - 
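// A minimal standalone sketch (not part of the patch itself) of the
// (pre, n, post) factorization that the removed GetDims helper computes:
// the shape is flattened into pre * n * post around `axis`, so a kernel can
// treat the tensor as pre * post independent slices of length n.
// The helper name below is illustrative only.
#include <cstdint>
#include <vector>

static void SplitAroundAxis(const std::vector<int64_t>& dims, int axis,
                            int64_t* pre, int64_t* n, int64_t* post) {
  *pre = 1;
  *n = dims[axis];
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= dims[i];
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) {
    *post *= dims[i];
  }
}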
std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort( - col_vec.begin(), col_vec.begin() + k - 1, - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - } - } - for (Type j = 0; j < k; ++j) { - t_out[i * k + j] = col_vec[j].first; - t_indices[i * k + j] = col_vec[j].second; - } - } -} - -template -static void FullTopKAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data, - const int& k) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class TopkV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Get the top k elements of each row of input tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - const auto& sorted = static_cast(context.Attr("sorted")); - const auto& largest = static_cast(context.Attr("largest")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - // if K tensor is not null, will the use K tesnor as k - auto* k_t = context.Input("K"); - if (k_t) { - k = k_t->data()[0]; - framework::DDim output_dims = output->dims(); - // accroding to axis to set K value in the dim - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - const auto& out_dims = output->dims(); - if (axis + 1 == in_dims.size()) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - FullTopK(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k, largest, sorted); - } else { - // if the topk dims is not last dim, will tranpose and do topk - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - // get the trans input_dims, out_dims - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - for (size_t i = 0; i < trans.size(); i++) { - 
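// A minimal standalone sketch (not part of the patch) of the NaN handling the
// removed FullTopK comparators implement for top_k_v2 compatibility: NaN is
// treated as the maximum value while selecting the k largest entries.
// The helper name and float element type are illustrative assumptions;
// it assumes 0 < k <= row.size().
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

static void TopKLargestNaNAsMax(const std::vector<float>& row, int k,
                                std::vector<float>* values,
                                std::vector<int64_t>* indices) {
  std::vector<std::pair<float, int64_t>> col;
  col.reserve(row.size());
  for (int64_t j = 0; j < static_cast<int64_t>(row.size()); ++j) {
    col.emplace_back(row[j], j);
  }
  // NaN compares "greater" than any finite value, mirroring the removed kernel.
  auto greater = [](const std::pair<float, int64_t>& l,
                    const std::pair<float, int64_t>& r) {
    return (std::isnan(l.first) && !std::isnan(r.first)) || (l.first > r.first);
  };
  std::partial_sort(col.begin(), col.begin() + k, col.end(), greater);
  values->resize(k);
  indices->resize(k);
  for (int j = 0; j < k; ++j) {
    (*values)[j] = col[j].first;
    (*indices)[j] = col[j].second;
  }
}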
trans_out_dims[i] = out_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - // Allocate the temp tensor to the save the topk indices, values - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - Tensor tmp_indices; - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - // get the TopK value - FullTopK(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k, largest, sorted); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - } - } -}; - -template -class TopkV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - const size_t& k = out_dims[axis]; - - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis + 1 == in_dims.size()) { - // allocate the memory for the input_grad - - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data, k); - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - // transpose the out_grad, indices - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // Do transpose - TransCompute(ndims, dev_context, *out_grad, - &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - - // Assign the out_grad to tranpose input_grad - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, 
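// A minimal standalone sketch (not part of the patch) of the permutation the
// removed TopkV2 kernels build when the requested axis is not the last one:
// `axis` is swapped with the final dimension, top-k runs on the last dim of
// the transposed tensor, and because a swap is its own inverse the same
// permutation transposes the results back. The helper name is illustrative.
#include <vector>

static std::vector<int> MakeTopKPerm(int rank, int axis) {
  std::vector<int> trans;
  trans.reserve(rank);
  for (int i = 0; i < axis; ++i) trans.push_back(i);
  trans.push_back(rank - 1);  // the last dim takes the place of `axis`
  for (int i = axis + 1; i < rank - 1; ++i) trans.push_back(i);
  trans.push_back(axis);      // `axis` becomes the last dim
  return trans;
}
// Example: rank = 4, axis = 1  ->  {0, 3, 2, 1}.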
x_grad->numel() * sizeof(T)); - - FullTopKAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out, k); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 5b8a6b3e75449508afa5d316d81f97ab815c9ea9..caaae02124c926b9e4be08744e4192dab20ca5d0 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index e11070638834c46a6628d652216e1ddddeb2487d..dff5c2d3f39378486bb5d2f8010d005d57b20550 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc index 49daac2ff0da63c542a807dc97925c6989559f14..4d9c39be92eff029e66cdde900318b045c2b531f 100644 --- a/paddle/fluid/operators/top_k_v2_op_xpu.cc +++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/transpose_op.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 63b914a31a86aef48e952a4877c7beb670075cc4..c6c0fa3c0019eac742a9c70ea53a438f5a474895 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2. 
)DOC"); } }; -class TraceOpGrad : public framework::OperatorWithKernel { +class TraceGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -107,14 +107,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, - PT_INFER_META(phi::TraceInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PD_INFER_META(phi::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, ops::TraceGradOpMaker, TraceInferShapeFunctor); -REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad, +REGISTER_OPERATOR(trace_grad, ops::TraceGradOp, ops::TraceGradNoNeedBufferVarsInferer); /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 768ab21936f1efbd2f50470446fd3f8d3ecb094c..1a297e7238ccdacd9b4986a5fe69e155d30e4318 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { } }; +class TransposeGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType(framework::GradVarName("Out"), + framework::GradVarName("X")); + } +}; + } // namespace operators } // namespace paddle @@ -347,59 +355,13 @@ REGISTER_OPERATOR( transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad, + ops::TransposeGradInferVarType); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad, + ops::TransposeGradInferVarType, ops::Transpose2DoubleGradMaker, ops::Transpose2DoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose2_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu deleted file mode 100644 index 02e224549a5abfb14729355addeb52824e450570..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/transpose_op.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class TransposeGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *x_tensor, axis, out_tensor); - } -}; -template -class TransposeGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - if (!x_grad) { - return; - } - - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *out_grad_tensor, reversed_axis, - x_grad_tensor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - transpose2, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose2_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - 
ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index b542fa37f88fd3e4d53b475d49d8f0491b9b5b42..a31ac28c9910c0c36b28c98fd3d83476f002df7e 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,9 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -258,10 +259,10 @@ struct SystemElemType<16> { }; template -void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, - int tile_size_i, int tile_size_j, - int total_tiles_count, const T* input, - const Dim3& input_dims, T* output) { +void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, + int tile_size_j, int total_tiles_count, + const T* input, const Dim3& input_dims, + T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { TilingSwapDim1And2< @@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, template struct NarrowDims2TransposeDispatch { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if< CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -368,7 +369,7 @@ struct NarrowDims2TransposeDispatch< }; template -void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input, +void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, const Dim3& input_dims, T* output, const int kMinTileSize) { // First get available tile sizes for the data type requested as backups @@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input, // Here suppose convert all tensor to dim3, so just change dim1 and 2. 
template -void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, - const T* input, const Dim3& input_dims, - T* output) { +void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, + const Dim3& input_dims, T* output) { // Suppose tile size > 16 static const int kMinTileSize = 16; static const int kMinNarrowTileSize = 96; @@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, } else { // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_elements); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( total_elements, input, input_dims, output); @@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, template struct SwapDim1And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -533,7 +533,7 @@ struct SwapDim1And2InTranspose { template struct SwapDim0And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -541,7 +541,7 @@ struct SwapDim0And2InTranspose { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( @@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { - static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in, + static bool run(const phi::GPUContext& ctx, const Tensor& in, const std::vector perm, Tensor* out) { // First reduce the dimensions of the input tensor if possible. 
std::vector new_perm; @@ -654,12 +654,12 @@ struct TransposeSimple { }; template -void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const int ndims, const Tensor& in, - const std::vector perm, Tensor* out) { +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, + const Tensor& in, + const std::vector& perm, Tensor* out) { auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + TransCompute(ndims, dev_ctx, in, out, perm); } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index ec05a534c0ef5327ec5d6d7f89b4e16b7a829434..a9e4876cc82a44ef8e87049a199ce0b58a96f6ea 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -59,63 +59,5 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } -template -class TransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); - } -}; - -template -class TransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - - if (!x_grad) { - return; - } - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad_tensor, - x_grad_tensor, reversed_axis); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..40cb22bab50ec0de5cc0fb9a2c6953637a238599 --- /dev/null +++ b/paddle/fluid/operators/transpose_op_mlu.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class TransposeMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + std::vector axis = ctx.Attr>("axis"); + out->mutable_data(ctx.device_context().GetPlace()); + + TransposeFromMLUTensor(ctx, axis, x, out, + false /*need_reshape_or_alloc*/); + } +}; + +template +class TransposeGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + std::vector axis = ctx.Attr>("axis"); + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + x_grad->mutable_data(ctx.GetPlace()); + + TransposeFromMLUTensor(ctx, reversed_axis, out_grad, x_grad, + false /*need_reshape_or_alloc*/); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(transpose2, ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel); + +REGISTER_OP_MLU_KERNEL(transpose2_grad, ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index cce3f188c8b7429447309e989e1e0dd5b9f13be0..fb39034c8e92c1ac39aa1ca6e57d5a08ca1ca9d6 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -24,14 +24,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(transpose2); +USE_OP_ITSELF(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); template diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 9233917b0931b98d30b736ec9b69fd68c0604d18..df84659a00f4c4220853404a8b28c6ccc93623a3 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
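// A minimal standalone sketch (not part of the patch) of why the transpose
// grad kernels above build `reversed_axis[axis[i]] = i`: that is the inverse
// permutation, and the gradient w.r.t. X is simply dOut transposed with it.
// The helper name is an illustrative assumption.
#include <cstddef>
#include <vector>

static std::vector<int> InversePerm(const std::vector<int>& axis) {
  std::vector<int> inv(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    // Output dim i of the forward transpose came from input dim axis[i],
    // so the backward transpose must send dim axis[i] back to position i.
    inv[axis[i]] = static_cast<int>(i);
  }
  return inv;
}
// Example: axis = {0, 2, 3, 1}  ->  inverse = {0, 3, 1, 2}.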
*/ #include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/solve_op.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,58 +25,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE( - x_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), x_dims)); - - PADDLE_ENFORCE_GE( - y_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), - x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), - y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], - y_dims_vec[y_dims_n - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( @@ -168,20 +119,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PD_INFER_META(phi::TriangularSolveInferMeta)); + REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, ops::TriangularSolveOpInferVarType, ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker); + ops::TriangularSolveOpGradMaker, + TriangularSolveInferShapeFunctor); REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); - -REGISTER_OP_CPU_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CPU_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu deleted file mode 100644 index 7df98517e8418905f0f8c8ce603762967a8b5f38..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/triangular_solve_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" - -namespace paddle { -namespace operators { - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CUDA_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index 4e68add096ff28f5378b02689248c3957c1e8ae9..fd46aca456cd9bd883cf9d1ce3576b307794b1a5 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { @@ -30,10 +29,10 @@ namespace operators { using Tensor = framework::Tensor; template -static void triangular_solve(const DeviceContext& context, const Tensor& x, - const Tensor& y, Tensor* out, bool upper, +static void triangular_solve(const DeviceContext &context, const Tensor &x, + const Tensor &y, Tensor *out, bool upper, bool transpose, bool unitriangular) { - // Tensor broadcast use eigen + // Tensor broadcast use eigen library std::vector x_bst_dims_vec; std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); @@ -61,169 +60,5 @@ static void triangular_solve(const DeviceContext& context, const Tensor& x, unitriangular); } -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& input, Tensor* output, - const framework::ExecutionContext& ctx); -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - out->Resize(phi::make_ddim(out_bst_dims)); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - - ReduceKernelFunctor( - &in, out, out_reduce_dims, true, false, ctx) - .template apply(); - out->Resize(phi::make_ddim(out_dims)); - } -}; - -template -class TriangularSolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - const auto& dev_ctx = ctx.template device_context(); - triangular_solve(dev_ctx, *x, *y, out, upper, transpose, - unitriangular); - } -}; - -template -class TriangularSolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - const auto* out = ctx.Input("Out"); - const auto* dout = - ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - auto& dev_ctx = ctx.template device_context(); - - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y); - - Tensor dy_bst(y->type()); - if (dy) { - dy->mutable_data(y->dims(), dev_ctx.GetPlace()); - 
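// A minimal standalone sketch (not part of the patch) of the rule the removed
// MatrixReduceSumFunctor uses to pick which batch dimensions to sum over when
// shrinking a broadcast gradient back to its original shape: left-pad the
// target shape with 1s, then reduce every leading (non-matrix) dim where the
// broadcast shape is larger than 1 but the target is 1. The function name is
// an illustrative assumption.
#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<int> BatchReduceDims(const std::vector<int64_t>& in_dims,
                                        const std::vector<int64_t>& out_dims) {
  const size_t in_size = in_dims.size();
  const size_t out_size = out_dims.size();
  std::vector<int64_t> out_bst(in_size, 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst.begin() + (in_size - out_size));
  std::vector<int> reduce_dims;
  // The trailing two (matrix) dims always agree, so only batch dims qualify.
  for (size_t idx = 0; idx + 2 < in_size; ++idx) {
    if (in_dims[idx] != 1 && out_bst[idx] == 1) {
      reduce_dims.push_back(static_cast<int>(idx));
    }
  }
  return reduce_dims;
}
// Example from the removed comment: in = {5, 3, 2, 7, 3}, out = {3, 1, 7, 3}
// gives reduce dims {0, 2}.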
dy_bst.Resize(phi::make_ddim(y_bst_dims_vec)); - dy_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate x's conjugate for complex - Tensor x_conj(x->type()); - platform::ForRange x_for_range(dev_ctx, x->numel()); - phi::funcs::ConjFunctor x_functor( - x->data(), x->numel(), - x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcated. - triangular_solve(dev_ctx, x_conj, *dout, &dy_bst, upper, - !transpose, unitriangular); - - if (dy_bst.dims() == dy->dims()) { - framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy); - } else { - MatrixReduceSumFunctor functor; - functor(dy_bst, dy, ctx); - dy->Resize(y->dims()); - } - } - - Tensor dx_bst(x->type()); - if (dx) { - dx->mutable_data(x->dims(), dev_ctx.GetPlace()); - dx_bst.Resize(phi::make_ddim(x_bst_dims_vec)); - dx_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate out's conjugate for complex - Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } - - Tensor dx_bst_upper(x->type()); - // get upper or lower triangular - dx_bst_upper.Resize(dx_bst.dims()); - dx_bst_upper.mutable_data(dev_ctx.GetPlace()); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, dx_bst.numel()); - TrilTriuCompute tril_triu_computer(dx_bst.data(), unitriangular, - !upper, H, W, - dx_bst_upper.data()); - x_for_range(tril_triu_computer); - - if (dx_bst_upper.dims() == dx->dims()) { - framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx); - } else { - MatrixReduceSumFunctor functor; - functor(dx_bst_upper, dx, ctx); - dx->Resize(x->dims()); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb --- /dev/null +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under +the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TrilTriuXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const auto* x_data = x->data(); + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool lower = context.Attr("lower"); + auto xshape = phi::vectorize(x->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + tril_triu, ops::TrilTriuXPUKernel, + ops::TrilTriuXPUKernel); +#endif diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index 54f4deac80a74e2e471036c2e25d08a582e29a9d..b77775f5a8c094fc7aa05f2f017834681424207f 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfdbec41aa1c47d11e1decc259d08689..dc5a66dce16d698f9cfac01e3bdc776d08c2af19 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. 
 */
 
 #include 
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/truncated_gaussian_random_op.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound(
-            "Output(Out) of TruncatedGaussianRandomOp should not be null."));
-    auto shape = ctx->Attrs().Get>("shape");
-    std::vector out_dim;
-    out_dim.reserve(shape.size());
-    for (auto dim : shape) {
-      out_dim.push_back(static_cast(dim));
-    }
-    PADDLE_ENFORCE_GT(
-        shape.size(), 0UL,
-        platform::errors::InvalidArgument(
-            "the input shape of TruncatedGaussianRandomOp must be set, "
-            "But the rank of shape we received is %d",
-            shape.size()));
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dim));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
+
+DECLARE_INFER_SHAPE_FUNCTOR(
+    truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor,
+    PD_INFER_META(phi::TruncatedGaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    truncated_gaussian_random, ops::TruncatedGaussianRandomOp,
+    ops::TruncatedGaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    TruncatedGaussianRandomInferShapeFunctor);
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc
index c45b839d5b40bd1d0db25743406bb8cc319f1280..02fed3de6cef74f19a5dd4d8500017e6097a56a4 100644
--- a/paddle/fluid/operators/unfold_op.cc
+++ b/paddle/fluid/operators/unfold_op.cc
@@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
-                            PT_INFER_META(phi::UnfoldInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
+                            PD_INFER_META(phi::UnfoldInferMeta));
 REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker,
                   ops::UnfoldGradMaker,
                   ops::UnfoldGradMaker,
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 353d653f48141b2e68db6143c1ca0859a9ecc13f..1c22e60fa87aa73246806e4f5bc70e49a3b0f958 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker,
     paddle::operators::UniformRandomOpVarTypeInference);
 
-REGISTER_OP_CPU_KERNEL(
-    uniform_random, paddle::operators::CPUUniformRandomKernel,
-    paddle::operators::CPUUniformRandomKernel,
-    paddle::operators::CPUUniformRandomKernel);
 REGISTER_OP_CPU_KERNEL(
     uniform_random_batch_size_like,
     paddle::operators::CPUUniformRandomKernel,
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index fb38a6aded4cf173bb4c0dd96d131ff520b6701e..2ceb8a68d863dfe71458c67deeac7f54df0a662b 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(uniform_random,
-                        paddle::operators::GPUUniformRandomKernel,
-                        paddle::operators::GPUUniformRandomKernel);
 REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
                         paddle::operators::GPUUniformRandomKernel,
                         paddle::operators::GPUUniformRandomKernel);
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index a864c48ad757411861b6d2b3be40361c347601f8..b941dc21c3ab213e5abc2c4c908413b2b6222c41 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -25,8 +25,9 @@ DECLARE_bool(use_curand);
 #include 
 #include 
 #include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/index_impl.cu.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
 #endif
 
 namespace paddle {
@@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context,
   if (gen_cuda->GetIsInitPy() && seed_flag) {
     if (FLAGS_use_curand) {
       using MT = typename details::MPTypeTrait::Type;
-      distribution::uniform_distribution dist;
-      distribution::uniform_transform trans(min, max);
-      distribution::distribution_and_transform(dev_cxt, tensor, dist, trans);
+      phi::funcs::uniform_distribution dist;
+      phi::funcs::uniform_real_transform trans(min, max);
+      phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans);
     } else {
       auto seed_offset = gen_cuda->IncrementOffset(1);
       int64_t gen_offset = size * seed_offset.second;
       auto func =
           UniformGeneratorOffset(min, max, seed_offset.first, diag_num,
                                  diag_step, diag_val, gen_offset);
-      IndexKernel>(dev_cxt, tensor, func);
+      phi::IndexKernel>(dev_cxt, tensor, func);
     }
   } else {
     auto func =
         UniformGenerator(min, max, seed, diag_num, diag_step, diag_val);
-    IndexKernel>(dev_cxt, tensor, func);
+    phi::IndexKernel>(dev_cxt, tensor, func);
   }
 }
 #endif
diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc
index 1600bedc6b2fae9ba65a32e831eae4f43abeddf8..2c5f13f5a930788651c2e287febab7ad06aefd20 100644
--- a/paddle/fluid/operators/uniform_random_op_mlu.cc
+++ b/paddle/fluid/operators/uniform_random_op_mlu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/uniform_random_op.h"
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
@@ -57,14 +58,45 @@ class MLUUniformRandomKernel : public framework::OpKernel {
     tensor->mutable_data(ctx.GetPlace());
     int64_t size = tensor->numel();
-    const float min = static_cast(ctx.Attr("min"));
-    const float max = static_cast(ctx.Attr("max"));
+
+    Tensor cpu_tensor(tensor->dtype());
+    cpu_tensor.Resize(tensor->dims());
+    T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace());
+
+    std::uniform_real_distribution dist(
+        static_cast(ctx.Attr("min")),
+        static_cast(ctx.Attr("max")));
     unsigned int seed = static_cast(ctx.Attr("seed"));
-    // make mlu seed
-    MLUCnnlRandomGeneratorDesc random_desc(/*is_mlu200=*/false, seed);
-    cnnlDataType_t data_type = ToCnnlDataType(tensor->type());
-    MLUCnnl::RandomUniform(ctx, size, /*data type=*/data_type,
-                           random_desc.get(), min, max, GetBasePtr(tensor));
+    auto engine = framework::GetCPURandomEngine(seed);
+
+    for (int64_t i = 0; i < size; ++i) {
+      data_cpu[i] = dist(*engine);
+    }
+
+    unsigned int diag_num =
+        static_cast(ctx.Attr("diag_num"));
+    unsigned int diag_step =
+        static_cast(ctx.Attr("diag_step"));
+    auto diag_val = static_cast(ctx.Attr("diag_val"));
+    if (diag_num > 0) {
+      PADDLE_ENFORCE_GT(
+          size, (diag_num - 1) * (diag_step + 1),
+          platform::errors::InvalidArgument(
+              "ShapeInvalid: the diagonal's elements is equal (num-1) "
+              "* (step-1) with num %d, step %d,"
+              "It should be smaller than %d, but received %d",
+              diag_num, diag_step, (diag_num - 1) * (diag_step + 1), size));
+      for (int64_t i = 0; i < diag_num; ++i) {
+        int64_t pos = i * diag_step + i;
+        data_cpu[pos] = diag_val;
+      }
+    }
+
+    // copy to MLU
+    framework::TensorCopy(
+        cpu_tensor, ctx.GetPlace(),
+        ctx.template device_context(), tensor);
+    ctx.template device_context().Wait();
   }
 };
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810b34276632fa487e8f12d7b3b915..1be8f3387dbad85e0dce3593ad61b9c116b10ef0 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f3460f987f6fa2cb28970f97cc96b..a8ced783744a961eb8ce64983de7e9615763c1b6 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a8427c19410347209faa099673cb7c..602376d54e0d2a49b6cf4f6a78d332154c188a7e 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel,
-    ops::ViterbiDecodeKernel);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 3c546dd8156a2bdffc9615d171d4630faf3bb7fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include 
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template